Merge branch 'fix/report_ca' into feature/uat-tmp

周伟奇
Showing 14 changed files with 2892 additions and 47 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/named_enum.py
src/apps/doc/ocr/wb.py
src/apps/doc/views.py
src/common/electronic_afc_contract/afc_contract_ocr.py
src/common/electronic_afc_contract/get_char_fsm.py
src/common/electronic_hil_contract/get_char_fsm.py
src/common/electronic_hil_contract/hil_contract_ocr.py
src/common/fsm_econtract/const.py
src/common/fsm_econtract/fsm_contract_ocr.py
src/common/fsm_econtract/hmh_ocr.py
src/common/fsm_econtract/retriever.py
src/common/fsm_econtract/tools.py
--- a/src/apps/doc/consts.py
View file @e0d31a2
+++ b/src/apps/doc/consts.py
View file @e0d31a2
@@ -11,7 +11,7 @@ PAGE_SIZE_DEFAULT = 10
 FIXED_APPLICATION_ID_PREFIX = 'CH-S'

 DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT']
-DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT']
+DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT', 'OVP']
 COMPARE_DOC_SCHEME_LIST = ['CA', 'SE']

 HIL_PREFIX = 'HIL'
@@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44
 HIL_CONTRACT_3_CN_NAME = '车辆处置协议'
 HIL_CONTRACT_3_CLASSIFY = 45

-CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY}
+FSM_CONTRACT_WEP_CN_NAME = '延长保修合同'
+FSM_CONTRACT_WEP_CLASSIFY = 51
+
+FSM_CONTRACT_MSI_CN_NAME = '长悦保养合同'
+FSM_CONTRACT_MSI_CLASSIFY = 52
+
+FSM_CONTRACT_SC_CN_NAME = '汽车销售合同'
+FSM_CONTRACT_SC_CLASSIFY = 53
+
+CONTRACT_SET = {
+    CONTRACT_QRS_CLASSIFY, 
+    CONTRACT_CLASSIFY, 
+    HIL_CONTRACT_1_CLASSIFY, 
+    HIL_CONTRACT_2_CLASSIFY, 
+    HIL_CONTRACT_3_CLASSIFY,
+    FSM_CONTRACT_WEP_CLASSIFY,
+    FSM_CONTRACT_MSI_CLASSIFY,
+    FSM_CONTRACT_SC_CLASSIFY, 
+}

 CONTRACT_MAP = {
    HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME,
@@ -1065,8 +1083,13 @@ CONTRACT_MAP = {
    HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME,
    CONTRACT_CLASSIFY: CONTRACT_CN_NAME,
    CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME,
+    FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_CN_NAME,
+    FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_CN_NAME,
+    FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_CN_NAME, 
 }

+FSM_CONTRACT_CLASSIFY_SET = {FSM_CONTRACT_WEP_CLASSIFY, FSM_CONTRACT_MSI_CLASSIFY, FSM_CONTRACT_SC_CLASSIFY}
+
 # 保单
 INSURANCE_CN_NAME = '保单'
 INSURANCE_CLASSIFY = 42
@@ -1214,6 +1237,11 @@ BS_FIELD = 'bss_ocr'
 HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr'
 HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr'
 HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr'
+FSM_CONTRACT_WEP_FIELD = 'fsm_wep_ocr'
+FSM_CONTRACT_MSI_FIELD = 'fsm_msi_ocr'
+FSM_CONTRACT_SC_FIELD = 'fsm_sc_ocr' 
+
+
 BS_CLASSIFY = 10089

 RESULT_MAPPING = {
@@ -1238,6 +1266,9 @@ RESULT_MAPPING = {
    HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD,
    HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD,
    HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD,
+    FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_FIELD,
+    FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_FIELD,
+    FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_FIELD, 
 }

 CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD)
@@ -2313,29 +2344,42 @@ APPLICANT_TYPE_MAP = {

 APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager']

-FILE_NAME_PREFIX_MAP = {
-    AFC_PREFIX: [
-        ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'),
-        ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
-    ],
-    HIL_PREFIX: [
-        ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'),
-        ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'),
-        ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
-    ]
-}
+# FILE_NAME_PREFIX_MAP = {
+#     AFC_PREFIX: [
+#         ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'),
+#         ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
+#     ],
+#     HIL_PREFIX: [
+#         ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'),
+#         ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'),
+#         ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
+#     ]
+# }

 ECONTRACT_KEYWORDS_MAP = {
    AFC_PREFIX: [
        ('抵押贷款合同', CONTRACT_CLASSIFY),
        ('送达地址确认书', CONTRACT_QRS_CLASSIFY),
-        # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0),
+        ('抵押登记豁免函', HMH_CLASSIFY),
    ],
    HIL_PREFIX: [
        ('售后回租合同', HIL_CONTRACT_1_CLASSIFY),
        ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY),
        ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY),
-        # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0),
+        ('抵押登记豁免函', HMH_CLASSIFY),
+    ]
+}
+
+FSM_ECONTRACT_KEYWORDS_MAP = {
+    AFC_PREFIX: [
+        ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY),
+        ('长悦保养套餐服务合约', FSM_CONTRACT_MSI_CLASSIFY),
+        ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
+    ],
+    HIL_PREFIX: [
+        ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY),
+        ('长悦保养套餐服务合同', FSM_CONTRACT_MSI_CLASSIFY),
+        ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
    ]
 }

@@ -2345,6 +2389,12 @@ HIL_CONTRACT_TYPE_MAP = {
    str(HIL_CONTRACT_3_CLASSIFY): 1,
 }

+FSM_CONTRACT_TYPE_MAP = {
+    str(FSM_CONTRACT_WEP_CLASSIFY): 0,
+    str(FSM_CONTRACT_MSI_CLASSIFY): 1,
+    str(FSM_CONTRACT_SC_CLASSIFY): 2,
+}
+
 RESULT_MAP = {
    0: None,
    1: True,
--- a/src/apps/doc/management/commands/ocr_process.py
View file @e0d31a2
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @e0d31a2
@@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g
 from common.tools.pdf_to_img import PDFHandler
 from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict
 from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict
+from common.fsm_econtract.fsm_contract_ocr import predict as fsm_predict
+from common.fsm_econtract.hmh_ocr import predict as hmh_predict
 from apps.doc import consts
 # from apps.doc.ocr.edms import EDMS, rh
 from apps.doc.ocr.ecm import ECM, rh
@@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin):
                    res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get(
                        consts.ALL_POSITION_KEY, {}).get(key1, [])
                license_summary[classify] = [res]
-            else:
+            elif classify in consts.SE_HIL_CON_MAP:  # TODO FSM新合同写入数据库用于比对
                res = {}
                for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items():
                    if pno1 is None:
@@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin):
                                self.log_base, traceback.format_exc()))
                            error_list.append(1)
                            return
-                else:  # e-contract
+                else:  # e-contract or e-fsm-contract or e-hmh
                    try:
                        # pdf下载 处理 图片存储 识别
                        for times in range(consts.RETRY_TIMES):
@@ -1472,8 +1474,10 @@ class Command(BaseCommand, LoggerMixin):
                            self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
                                self.log_base, traceback.format_exc()))

+                        # AFC合同
                        if classify_1_str == str(consts.CONTRACT_CLASSIFY):
-                            ocr_result = afc_predict(pdf_handler.pdf_info)
+                            is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3]
+                            ocr_result = afc_predict(pdf_handler.pdf_info, is_fsm=is_fsm)
                            page_res = {}
                            for page_num, page_info in ocr_result.get('page_info', {}).items():
                                if isinstance(page_num, str) and page_num.startswith('page_'):
@@ -1483,6 +1487,7 @@ class Command(BaseCommand, LoggerMixin):
                                        'page_num': page_num,
                                        'page_info': page_info
                                    }
+                        # 送达地址确认书
                        elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY):
                            ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True)
                            page_num = 'page_1'
@@ -1493,9 +1498,11 @@ class Command(BaseCommand, LoggerMixin):
                                    'page_info': ocr_result.pop(page_num, {}) 
                                }
                            }
-                        else:
+                        # HIL合同
+                        elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP:
+                            is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3]
                            file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str)
-                            ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1)
+                            ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1, is_fsm=is_fsm)
                            rebuild_res_1 = {}
                            page_res = {}
                            for field_name, field_info in ocr_result_1.items():
@@ -1508,28 +1515,55 @@ class Command(BaseCommand, LoggerMixin):
                                        'page_num': page_num,
                                        'page_info': page_info
                                    }
+                        # FSM合同 WEP MSI SC
+                        elif classify_1_str in consts.FSM_CONTRACT_TYPE_MAP:
+                            file_type = consts.FSM_CONTRACT_TYPE_MAP.get(classify_1_str)
+                            ocr_result = fsm_predict(pdf_handler.pdf_info, file_type) 
+                            page_res = {}
+                            for page_num, page_info in ocr_result.items():
+                                if isinstance(page_num, str) and page_num.startswith('page_'):
+                                    page_res[page_num] = {
+                                        'classify': int(classify_1_str),
+                                        'page_num': page_num,
+                                        'page_info': page_info
+                                    }
+                        # hmh
+                        # else:
+                        #     pass
+

                        contract_res = {}
                        for img_path_tmp, page_key in pdf_handler.img_path_pno_list:
-                            if page_key in page_res:
+                            if classify_1_str == str(consts.HMH_CLASSIFY):
                                img_contract_res = {
-                                    'code': 1,
-                                    'data': [
-                                        {
-                                            'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY),
-                                            'data': page_res[page_key]
-                                        }
-                                    ]
-                                }
+                                        'code': 1,
+                                        'data': [
+                                            {
+                                                'classify': consts.HMH_CLASSIFY,
+                                                'data': hmh_predict(pdf_handler.pdf_info)
+                                            }
+                                        ]
+                                    }
                            else:
-                                img_contract_res = {
-                                    'code': 1,
-                                    'data': [
-                                        {
-                                            'classify': int(classify_1_str),
-                                        }
-                                    ]
-                                }
+                                if page_key in page_res:
+                                    img_contract_res = {
+                                        'code': 1,
+                                        'data': [
+                                            {
+                                                'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY),
+                                                'data': page_res[page_key]
+                                            }
+                                        ]
+                                    }
+                                else:
+                                    img_contract_res = {
+                                        'code': 1,
+                                        'data': [
+                                            {
+                                                'classify': int(classify_1_str),
+                                            }
+                                        ]
+                                    }
                            contract_res[img_path_tmp] = img_contract_res

                        with lock:
--- a/src/apps/doc/named_enum.py
View file @e0d31a2
+++ b/src/apps/doc/named_enum.py
View file @e0d31a2
@@ -36,6 +36,7 @@ class RequestTrigger(NamedEnum):
    DOCUPLOAD = (3, 'Document Upload')
    SUBMITING = (4, 'Submiting')
    UPLOADING = (5, 'Uploading')
+    OVP = (6, 'OVP')


 class FailureReason(NamedEnum):
--- a/src/apps/doc/ocr/wb.py
View file @e0d31a2
+++ b/src/apps/doc/ocr/wb.py
View file @e0d31a2
@@ -780,10 +780,12 @@ class BSWorkbook(Workbook):
            if field_str is not None:
                count_list.append((field_str, count))

-    def contract_rebuild(self, contract_result_dict):
+    def contract_rebuild(self, contract_result_dict, is_ca=False):
        for classify, contract_result in contract_result_dict.items():
            if len(contract_result) == 0:
                continue
+            if is_ca and classify not in consts.FSM_CONTRACT_CLASSIFY_SET:
+                continue
            ws = self.create_sheet(consts.CONTRACT_MAP.get(classify))
            for i in range(30):
                if str(i) in contract_result:
@@ -906,6 +908,7 @@ class BSWorkbook(Workbook):
        else:
            self.bs_rebuild(bs_summary, res_count_tuple, metadata)
            self.license_rebuild(license_summary, document_scheme, count_list)
+            self.contract_rebuild(contract_result, True)
        self.move_res_sheet()
        self.remove_base_sheet()
        return count_list
--- a/src/apps/doc/views.py
View file @e0d31a2
+++ b/src/apps/doc/views.py
View file @e0d31a2
@@ -602,13 +602,22 @@ class UploadDocView(GenericView, DocHandler):
        is_zip = False

        classify_1 = 0
-        # 电子合同
-        if data_source == consts.DATA_SOURCE_LIST[-1] and document_scheme == consts.DOC_SCHEME_LIST[1]:
-            for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix):
+        # 电子合同 Econtract or OVP(FSM)
+        if data_source == consts.DATA_SOURCE_LIST[2] or data_source == consts.DATA_SOURCE_LIST[3]:  
+            if document_scheme == consts.DOC_SCHEME_LIST[1]:
+                for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix):
+                    if keyword in document_name:
+                        classify_1 = classify_1_tmp
+                        break
+        # FSM合同：WEP/MSI/SC
+        elif data_source == consts.DATA_SOURCE_LIST[0] and document_scheme == consts.DOC_SCHEME_LIST[0]:
+            for keyword, classify_1_tmp in consts.FSM_ECONTRACT_KEYWORDS_MAP.get(prefix):
                if keyword in document_name:
                    classify_1 = classify_1_tmp
-                    break
-        elif document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
+                    break 
+
+
+        if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
                or document_name.endswith('.RAR'):
            is_zip = True

--- a/src/common/electronic_afc_contract/afc_contract_ocr.py
View file @e0d31a2
+++ b/src/common/electronic_afc_contract/afc_contract_ocr.py
View file @e0d31a2
@@ -6,6 +6,7 @@
 # @Description   :

 from .get_char import Finder
+from .get_char_fsm import Finder as FSMFinder
 import numpy as np


@@ -23,7 +24,7 @@ def extract_info(ocr_results):
    return {'page_1': {'合同编号': contract_no}}


-def predict(pdf_info, is_qrs=False):
+def predict(pdf_info, is_qrs=False, is_fsm=False):
    ocr_results = {}
    for pno in pdf_info:
        ocr_results[pno] = {}
@@ -50,7 +51,10 @@ def predict(pdf_info, is_qrs=False):
        results = extract_info(ocr_results)
    else:
        # 输入是整个 PDF 中的信息
-        f = Finder(pdf_info, ocr_results=ocr_results)
+        if is_fsm:
+            f = FSMFinder(pdf_info, ocr_results=ocr_results) 
+        else:
+            f = Finder(pdf_info, ocr_results=ocr_results)
        results = f.get_info()
    return results

--- a/src/common/electronic_afc_contract/get_char_fsm.py 0 → 100644
View file @e0d31a2
+++ b/src/common/electronic_afc_contract/get_char_fsm.py 0 → 100644
View file @e0d31a2
+import re
+import numpy as np
+from fuzzywuzzy import fuzz
+from shapely.geometry import Polygon
+
+
+class Finder:
+
+    def __init__(self, pdf_info, ocr_results):
+        """Keep the parsed PDF data and OCR results used by the extractors.
+
+        Args:
+            pdf_info: Per-page PDF content; the extractors index it by page
+                key and read ``['blocks']`` from each page.
+            ocr_results: Per-page OCR entries; the extractors unpack each
+                entry as ``(bbox, text)``.
+        """
+        # Blank field template; the extractors copy it before filling it in.
+        self.item = dict(words=None, position=None)
+        self.is_asp = False
+        self.ocr_results = ocr_results
+        self.pdf_info = pdf_info
+
+    def gen_init_result(self, is_asp):
+        """Build ``self.init_result``, the empty per-page output skeleton.
+
+        Every leaf of the skeleton is the very same ``self.item`` object (no
+        copies are made here), so leaves are placeholders meant to be
+        replaced rather than mutated in place.
+
+        Args:
+            is_asp: Not read by this method.
+        """
+        blank = self.item
+
+        def party():
+            return {"name": blank, "id": blank}
+
+        def principal():
+            return {
+                "大写": blank,
+                "小写": blank,
+                "车辆贷款本金金额": blank,
+                "附加产品融资贷款本金总金额": blank,
+            }
+
+        def account():
+            return {"账号": blank, "户名": blank, "开户行": blank}
+
+        def signed():
+            return {"签字": blank, "日期": blank}
+
+        pages = {
+            "page_1": {
+                "合同编号": blank,
+                "所购车辆价格": blank,
+                "车架号": blank,
+                "贷款本金金额": principal(),
+                "贷款期限": blank,
+                "附加产品融资贷款本金总金额明细": blank,
+                "借款人签字及时间": blank,
+            },
+            "page_2": {
+                "合同编号": blank,
+                "借款人及抵押人": party(),
+                "共同借款人及共同抵押人": party(),
+                "保证人1": party(),
+                "保证人2": party(),
+                "所购车辆价格": blank,
+                "车架号": blank,
+                "经销商": blank,
+                "贷款本金金额": principal(),
+                "贷款期限": blank,
+                "标准利率": blank,
+                "借款人收款账户": account(),
+                "还款账户": account(),
+            },
+            "page_3": {"合同编号": blank, "还款计划表": blank},
+            "page_4": {"合同编号": blank, "附加产品融资贷款本金总金额明细": blank},
+        }
+        # Pages 5-7 carry nothing but the contract number.
+        for page_key in ("page_5", "page_6", "page_7"):
+            pages[page_key] = {"合同编号": blank}
+        pages["page_8"] = {
+            "合同编号": blank,
+            "主借人签字": signed(),
+            "共借人签字": signed(),
+            "保证人1签字": signed(),
+            "保证人2签字": signed(),
+            "见证人签字": signed(),
+        }
+        self.init_result = pages
+
+    def get_top_iou(self, poly, ocr_result):
+        """Find the OCR entry whose box overlaps ``poly`` the most.
+
+        Args:
+            poly: Polygon vertices; reshaped to ``(-1, 2)`` before use.
+            ocr_result: Mapping of key -> ``(bbox, text)`` for one page.
+
+        Returns:
+            ``[iou, key]`` for the best-overlapping entry (the later entry
+            wins ties), or ``(-1, -1)`` when no entry could be compared.
+        """
+        if not ocr_result:
+            return -1, -1
+        # The query polygon is the same for every entry, so build it once.
+        target = Polygon(np.array(poly).reshape((-1, 2)))
+        if not target.is_valid:
+            return -1, -1
+        best = None
+        for key in ocr_result:
+            bbox, _text = ocr_result[key]
+            candidate = Polygon(np.array(bbox).reshape((-1, 2)))
+            if not candidate.is_valid:
+                continue
+            inter = candidate.intersection(target).area
+            union = candidate.area + target.area - inter
+            if union <= 0:
+                # Zero-area shapes have no meaningful IoU; skip them instead
+                # of dividing by zero.
+                continue
+            iou = inter / union
+            # ``>=`` keeps the last of several equal scores, matching the
+            # stable ascending sort this loop replaces.
+            if best is None or iou >= best[0]:
+                best = [iou, key]
+        if best is None:
+            return -1, -1
+        return best
+
+    def poly_to_rectangle(self, poly):
+        """Reduce an 8-value polygon to ``[xmin, ymin, xmax, ymax]``.
+
+        The values are picked by position, not by comparing coordinates:
+        the result is ``[poly[6], poly[3], poly[4], poly[7]]``. Any length
+        other than eight raises ``ValueError`` from the unpacking.
+        """
+        _x0, _y0, _x1, y1, x2, _y2, x3, y3 = poly
+        return [x3, y1, x2, y3]
+
+    def get_contract_no(self, page_num):
+        """Read the contract number from the OCR entries of one page.
+
+        Args:
+            page_num (string): Key of the page inside ``self.ocr_results``.
+
+        Returns:
+            dict: Copy of ``self.item``; when an entry contains '合同编号:',
+            ``words`` holds the text after the last ':' and ``position`` the
+            entry's rectangle. The last matching entry wins.
+        """
+        result = self.item.copy()
+        for bbox, text in self.ocr_results[page_num].values():
+            if '合同编号:' not in text:
+                continue
+            result['words'] = text.split(':')[-1]
+            result['position'] = self.poly_to_rectangle(bbox)
+        return result
+
+    def get_vehicle_price(self, page_num='0'):
+        """Read the purchased-vehicle price from the OCR entries of a page.
+
+        Returns a copy of ``self.item``; when an entry contains
+        '所购车辆价格为人民币', ``words`` is the text after the last '币' and
+        ``position`` the entry's rectangle. The last matching entry wins.
+        """
+        price = self.item.copy()
+        for bbox, text in self.ocr_results[page_num].values():
+            if '所购车辆价格为人民币' not in text:
+                continue
+            price['words'] = text.split('币')[-1]
+            price['position'] = self.poly_to_rectangle(bbox)
+        return price
+
+    def get_vin(self, page_num='0'):
+        """Read the vehicle identification number from a page's OCR entries.
+
+        Returns a copy of ``self.item``; when an entry contains '车架号:',
+        ``words`` is the text after the last ':' and ``position`` the
+        entry's rectangle. The last matching entry wins.
+        """
+        found = self.item.copy()
+        for bbox, text in self.ocr_results[page_num].values():
+            if '车架号:' not in text:
+                continue
+            found['words'] = text.split(':')[-1]
+            found['position'] = self.poly_to_rectangle(bbox)
+        return found
+
+    def get_loan_principal(self, page_num='0'):
+        """Extract the loan principal amounts from the text spans of a page.
+
+        Only blocks whose ``type`` is 0 are scanned. For every field the
+        last matching span wins.
+
+        Args:
+            page_num: Key of the page inside ``self.pdf_info``.
+
+        Returns:
+            Tuple ``(upper, lower, asp_1, asp_2)`` of copies of ``self.item``:
+            ``upper`` is the amount in Chinese capital numerals, ``lower``
+            the text after '¥', and ``asp_1`` / ``asp_2`` the bracketed
+            amounts found above / below the
+            '附加产品融资贷款本金总金额' anchor span.
+        """
+        chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
+                            '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
+        # Joined once: the reference string is identical for every span.
+        keyword_text = ''.join(chinese_keywords)
+        amount_pattern = re.compile(r'人民币：小写：\[(.*)\]')
+        upper = self.item.copy()
+        lower = self.item.copy()
+        asp_1 = self.item.copy()
+        asp_2 = self.item.copy()
+        anchor_bbox = None
+        for block in self.pdf_info[page_num]['blocks']:
+            if block['type'] != 0:
+                continue
+            for line in block['lines']:
+                for span in line['spans']:
+                    bbox, text = span['bbox'], span['text']
+                    if fuzz.ratio(keyword_text, text) > 15:
+                        # ``text`` is rebound here, so the two checks below
+                        # see the trimmed value (unchanged behaviour).
+                        text = text.split('：')[-1].strip()
+                        upper['position'] = bbox
+                        upper['words'] = text
+                    if '小写：¥' in text:
+                        lower['position'] = bbox
+                        lower['words'] = text.split('¥')[-1].strip()
+                    if '附加产品融资贷款本金总金额' == text:
+                        anchor_bbox = bbox
+        if anchor_bbox:
+            # Mean of the odd-indexed coordinates of the anchor box; spans
+            # are classified as above or below it.
+            anchor_y = np.mean(anchor_bbox[1::2])
+            for block in self.pdf_info[page_num]['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text']
+                        if '人民币：小写：' not in text:
+                            continue
+                        found = amount_pattern.search(text)
+                        if found is None:
+                            # Marker present but no bracketed amount: skip
+                            # rather than fail on a missing match.
+                            continue
+                        span_y = np.mean(bbox[1::2])
+                        if span_y < anchor_y:
+                            asp_1['position'] = bbox
+                            asp_1['words'] = found.group(1)
+                        elif span_y > anchor_y:
+                            asp_2['position'] = bbox
+                            asp_2['words'] = found.group(1)
+        return upper, lower, asp_1, asp_2
+
+    def get_loan_term(self, page_num='0'):
+        """Extract the loan term in months from the text spans of a page.
+
+        The texts of all spans in type-0 blocks are concatenated and searched
+        for '贷款期限<digits>个月'. On a hit, ``words`` is the digit string
+        and ``position`` the bbox of the last span containing
+        '<digits>个月'. Otherwise an untouched copy of ``self.item`` is
+        returned.
+        """
+        def text_spans():
+            for block in self.pdf_info[page_num]['blocks']:
+                if block['type'] == 0:
+                    for line in block['lines']:
+                        for span in line['spans']:
+                            yield span['bbox'], span['text']
+
+        loan_term = self.item.copy()
+        page_text = ''.join(text for _bbox, text in text_spans())
+        found = re.search(r'贷款期限(\d+)个月', page_text)
+        if not found:
+            return loan_term
+        words = found.group(1)
+        needle = f'{words}个月'
+        for bbox, text in text_spans():
+            if needle in text:
+                loan_term['position'] = bbox
+                loan_term['words'] = words
+        return loan_term
+
+    def get_standard_rate(self, page_num='0'):
+        """Extract the standard annual interest rate from a page's spans.
+
+        Each span in a type-0 block is searched for
+        '本合同当期的标准利率为<rate>%/年'. ``words`` receives the rate
+        without the percent sign and ``position`` the span's bbox. The last
+        matching span wins.
+        """
+        rate_pattern = re.compile(r'本合同当期的标准利率为(\S+)%/年')
+        standard_rate = self.item.copy()
+        text_blocks = (blk for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0)
+        for blk in text_blocks:
+            for line in blk['lines']:
+                for span in line['spans']:
+                    bbox, text = span['bbox'], span['text']
+                    found = rate_pattern.search(text)
+                    if found:
+                        standard_rate['position'] = bbox
+                        standard_rate['words'] = found.group(1)
+        return standard_rate
+
+    def mergelist(self, text_list):
+        """Merge each '所购' entry with the Chinese text that continues it.
+
+        While some element contains '所购' and the element after it has at
+        least one Chinese character, the last such pair is concatenated into
+        a single element. A '所购' element in final position has nothing to
+        merge with and is left alone.
+
+        Args:
+            text_list: List of strings.
+
+        Returns:
+            ``text_list`` itself when nothing was merged, otherwise a new
+            list with the merges applied.
+        """
+        non_chinese = re.compile("[^\u4e00-\u9fa5]")  # anything that is not a Chinese character
+        while True:
+            merge_index = -1
+            # Stop one short of the end so the look-ahead never runs past
+            # the list.
+            for index in range(len(text_list) - 1):
+                if '所购' in text_list[index] and non_chinese.sub('', text_list[index + 1]):
+                    merge_index = index
+            if merge_index == -1:
+                return text_list
+            merged = text_list[merge_index] + text_list[merge_index + 1]
+            text_list = text_list[:merge_index] + [merged] + text_list[merge_index + 2:]
+
+    def get_asp_details(self, page_num):
+        """Rebuild the add-on product financing table from a page's OCR boxes.
+
+        The table starts with a title row and a header row. Header cells are
+        located by exact text match, then up to ten rows are read by stepping
+        downward from the '项目1' cell and picking the OCR box with the best
+        IoU at each expected position. A total row is appended when one of
+        the total-label texts was seen.
+
+        Args:
+            page_num: Key of the page inside ``self.ocr_results``.
+
+        Returns:
+            Copy of ``self.item`` whose ``words`` is the table (list of rows);
+            ``position`` is not set here.
+
+        NOTE(review): ``get_top_iou`` returns ``(-1, -1)`` when nothing can
+        be compared; only the first lookup in the loop checks ``_iou``. The
+        later lookups index ``self.ocr_results[page_num][_key]`` directly,
+        and ``bbox_ytzje`` / ``bbox_dkbj`` are used without a ``None`` check
+        — confirm the inputs always contain those header cells.
+        """
+        asp_details_table_term = self.item.copy()
+
+        asp_details_table = [['附加产品融资贷款本金总金额及贷款利率明细'], ['项目1', '用途总金额2', '贷款本金3']]
+
+        # Boxes of the three header cells and of the total-row label.
+        bbox_xm = None
+        bbox_ytzje = None
+        bbox_dkbj = None
+        bbox_total = None
+        for key in self.ocr_results[page_num]:
+            bbox, text = self.ocr_results[page_num][key]
+            if text == '项目1':
+                bbox_xm = bbox
+            if text == '用途总金额2':
+                bbox_ytzje = bbox
+            if text == '贷款本金3':
+                bbox_dkbj = bbox
+            if text in ['附加产品融资贷款本', '附加产品融资贷款本金', '附加产品融资贷']:
+                bbox_total = bbox
+
+        if bbox_xm:
+            for i in range(10):
+                # Row height taken from the second and last values of the
+                # current first-column box.
+                rh = abs(bbox_xm[1]-bbox_xm[-1])
+                anchor = np.array(bbox_xm).reshape((-1 ,2))
+                # Shift the box down by 1.4 row heights to probe the next row.
+                anchor[:, 1] += int(rh*1.4)
+                _iou, _key = self.get_top_iou(poly=anchor, ocr_result=self.ocr_results[page_num])
+                if _iou > 0:
+                    bbox, xm_text = self.ocr_results[page_num][_key]
+                    # The matched box becomes the reference for the next step.
+                    bbox_xm = bbox
+                    # Handle an item description that wraps onto a second
+                    # line: text without '所购' is appended to the first cell
+                    # of the previous row.
+                    if not '所购' in xm_text:
+                        line = asp_details_table[-1]
+                        line[0] += xm_text
+                        asp_details_table[-1] = line
+                        continue
+                    # print(xm_text)
+                    # Probe the second column: x values from its header box,
+                    # y values from the current row box.
+                    anchor_1 = [bbox_ytzje[0], bbox[1], bbox_ytzje[2], bbox[3],
+                                bbox_ytzje[4], bbox[5], bbox_ytzje[6], bbox[7]]
+                    _iou, _key = self.get_top_iou(poly=anchor_1, ocr_result=self.ocr_results[page_num])
+                    bbox, ytzje_text = self.ocr_results[page_num][_key]
+                    # print(ytzje_text)
+                    # Probe the third column; ``bbox`` is now the box matched
+                    # for the second column.
+                    anchor_2 = [bbox_dkbj[0], bbox[1], bbox_dkbj[2], bbox[3],
+                                bbox_dkbj[4], bbox[5], bbox_dkbj[6], bbox[7]]
+                    _iou, _key = self.get_top_iou(poly=anchor_2, ocr_result=self.ocr_results[page_num])
+                    bbox, dkbj_text = self.ocr_results[page_num][_key]
+                    # print(dkbj_text)
+                    # The first two columns resolved to the same text: split
+                    # it on the space (expects exactly two parts).
+                    if xm_text == ytzje_text:
+                        xm_text, ytzje_text = xm_text.split(' ')
+                    line = [xm_text, ytzje_text, dkbj_text]
+                    asp_details_table.append(line)
+                else:
+                    break
+        
+        if bbox_total:
+            # Total amount: third-column x values with the total label's y
+            # values.
+            anchor = [bbox_dkbj[0], bbox_total[1], bbox_dkbj[2], bbox_total[3],
+                    bbox_dkbj[4], bbox_total[5], bbox_dkbj[6], bbox_total[7]]
+            _iou, _key = self.get_top_iou(poly=anchor, ocr_result=self.ocr_results[page_num])
+            bbox, total_text = self.ocr_results[page_num][_key]
+            asp_details_table.append(['附加产品融资贷款本金总金额:', '', total_text])
+        asp_details_table_term['words'] = asp_details_table
+
+        return asp_details_table_term
+
+    def get_signature(self):
+        """Find the signing-date text on page '0'.
+
+        Every span of the type-0 blocks of ``self.pdf_info['0']`` is inspected;
+        a span whose text contains '签署日期' supplies ``words`` (the full span
+        text) and ``position`` (its bbox).  If several spans match, the last
+        one wins.
+
+        Returns:
+            dict: a copy of ``self.item``, filled in when a match exists.
+        """
+        result = self.item.copy()
+        text_blocks = (blk for blk in self.pdf_info['0']['blocks'] if blk['type'] == 0)
+        for blk in text_blocks:
+            for row in blk['lines']:
+                for piece in row['spans']:
+                    box, content = piece['bbox'], piece['text']
+                    if '签署日期' not in content:
+                        continue
+                    result['words'] = content
+                    result['position'] = box
+        return result
+
+    def get_somebody(self, top, bottom):
+        """Return the name and ID entries found between two marker texts on page '1'.
+
+        Args:
+            top: text marking the upper boundary; the bbox[3] of the last span
+                containing it is used.
+            bottom: text marking the lower boundary, handled the same way.
+
+        Returns:
+            tuple: ``(name_item, id_item)``, both copies of ``self.item``.  A
+            span whose bbox[3] lies strictly between the two boundaries fills
+            ``name_item`` when it contains '姓名/名称' and ``id_item`` when it
+            contains '自然人身份证件号码/法人执照号码'; ``words`` is the part
+            after the last '：'.
+        """
+        name_item, id_item = self.item.copy(), self.item.copy()
+
+        def iter_spans():
+            # Yield (bbox, text) for every span of the type-0 blocks of page '1'.
+            for blk in self.pdf_info['1']['blocks']:
+                if blk['type'] != 0:
+                    continue
+                for row in blk['lines']:
+                    for piece in row['spans']:
+                        yield piece['bbox'], piece['text']
+
+        # First pass: fix the vertical window (both default to 0).
+        upper = lower = 0
+        for box, content in iter_spans():
+            if top in content:
+                upper = box[3]
+            if bottom in content:
+                lower = box[3]
+
+        # Second pass: pick the fields that fall inside the window.
+        for box, content in iter_spans():
+            if not upper < box[3] < lower:
+                continue
+            if '姓名/名称' in content:
+                name_item['position'] = box
+                name_item['words'] = content.split('：')[-1]
+            if '自然人身份证件号码/法人执照号码' in content:
+                id_item['position'] = box
+                id_item['words'] = content.split('：')[-1]
+        return name_item, id_item
+
+    def get_seller(self):
+        """Read the dealer / vehicle seller value on page '1'.
+
+        The key cell is the last span whose text is exactly '经销商' or
+        '车辆销售方'.  The value is the last span whose mean x lies between the
+        key's right edge and half the page width and whose mean y lies within
+        the key's vertical extent.
+
+        Returns:
+            dict: a copy of ``self.item``, filled in when a value was found.
+        """
+        result = self.item.copy()
+
+        def iter_spans():
+            # Yield (bbox, text) for every span of the type-0 blocks of page '1'.
+            for blk in self.pdf_info['1']['blocks']:
+                if blk['type'] != 0:
+                    continue
+                for row in blk['lines']:
+                    for piece in row['spans']:
+                        yield piece['bbox'], piece['text']
+
+        # Step 1: locate the key cell.
+        key_box = None
+        for box, content in iter_spans():
+            if content in ('经销商', '车辆销售方'):
+                key_box = box
+        if not key_box:
+            return result
+
+        # Step 2: take the span sitting to the right of the key, left half only.
+        mid_x = self.pdf_info['1']['width'] * 0.5
+        for box, content in iter_spans():
+            right_of_key = key_box[2] < np.mean(box[::2]) < mid_x
+            if right_of_key and key_box[1] < np.mean(box[1::2]) < key_box[3]:
+                result['position'] = box
+                result['words'] = content
+        return result
+
+    def get_borrower_collection_account(self):
+        """Extract the borrower collection account (number, holder, bank) from page '1'.
+
+        The text of all spans on the page is concatenated.  Only when it
+        contains '借款人收款账户' are the three values pulled out by regex (after
+        removing full-width and ASCII spaces); each value is then looked up in
+        the individual spans to obtain a bbox.  The last span containing the
+        value wins.  Note that an empty capture is contained in every span.
+
+        Returns:
+            tuple: ``(account, account_name, account_bank)``, copies of
+            ``self.item`` that are filled in where a value was located.
+        """
+        account, account_name, account_bank = (self.item.copy() for _ in range(3))
+
+        def iter_spans():
+            # Yield (bbox, text) for every span of the type-0 blocks of page '1'.
+            for blk in self.pdf_info['1']['blocks']:
+                if blk['type'] != 0:
+                    continue
+                for row in blk['lines']:
+                    for piece in row['spans']:
+                        yield piece['bbox'], piece['text']
+
+        def locate(target, value):
+            # Record `value` plus the bbox of each span that contains it.
+            for box, content in iter_spans():
+                if value in content:
+                    target['position'] = box
+                    target['words'] = value
+
+        page_text = ''.join(content for _, content in iter_spans())
+        # Only the explicit-account layout is reported.
+        if '借款人收款账户' not in page_text:
+            return account, account_name, account_bank
+
+        page_text = page_text.replace('　', '').replace(' ', '')
+        lookups = (
+            (r'账号：(.*?)户名', account),
+            (r'户名：(.*?)开户行', account_name),
+            (r'开户行：(.*?)借款人', account_bank),
+        )
+        for pattern, target in lookups:
+            hits = re.findall(pattern, page_text)
+            if hits:
+                locate(target, hits[0])
+        return account, account_name, account_bank
+
+    def get_payback_account(self):
+        """Extract the repayment account (number, holder, bank) from page '1'.
+
+        The text of all spans on the page is concatenated.  Only when it
+        contains '(13) 还款账户' is the part after the last such marker searched
+        by regex (after removing full-width and ASCII spaces).  Account number
+        and holder are located by plain containment in a span; the bank is
+        located by the full '开户行：<value>；' phrase in the span text with
+        full-width spaces removed.  The last matching span wins.
+
+        Returns:
+            tuple: ``(account, account_name, account_bank)``, copies of
+            ``self.item`` that are filled in where a value was located.
+        """
+        account, account_name, account_bank = (self.item.copy() for _ in range(3))
+        marker = '(13) 还款账户'
+
+        def iter_spans():
+            # Yield (bbox, text) for every span of the type-0 blocks of page '1'.
+            for blk in self.pdf_info['1']['blocks']:
+                if blk['type'] != 0:
+                    continue
+                for row in blk['lines']:
+                    for piece in row['spans']:
+                        yield piece['bbox'], piece['text']
+
+        def locate(target, value, needle, normalise=lambda s: s):
+            # Record `value` plus the bbox of each span whose text holds `needle`.
+            for box, content in iter_spans():
+                if needle in normalise(content):
+                    target['position'] = box
+                    target['words'] = value
+
+        page_text = ''.join(content for _, content in iter_spans())
+        # Only the explicit-account layout is reported.
+        if marker not in page_text:
+            return account, account_name, account_bank
+
+        tail = page_text.split(marker)[-1]
+        tail = tail.replace('　', '').replace(' ', '')
+
+        hits = re.findall(r'账号：(.*?)户名', tail)
+        if hits:
+            locate(account, hits[0], hits[0])
+
+        hits = re.findall(r'户名：(.*?)开户行', tail)
+        if hits:
+            locate(account_name, hits[0], hits[0])
+
+        hits = re.findall(r'开户行：(.*?)；', tail)
+        if hits:
+            locate(account_bank, hits[0], f'开户行：{hits[0]}；',
+                   normalise=lambda s: s.replace('　', ''))
+        return account, account_name, account_bank
+
+    def get_repayment_schedule(self):
+        """Rebuild the repayment schedule table from page '2'.
+
+        Span texts are collected from the span that equals '序号' up to (not
+        including) the span containing '以上表格中所列的序号并非还款期数', then
+        grouped into rows of five cells.  Grouping stops at the first row whose
+        second cell equals the 1-based row index.
+
+        Returns:
+            dict: a copy of ``self.item``; ``words`` holds the list of rows
+            when at least one row was produced.
+        """
+        result = self.item.copy()
+        columns = 5  # the table has five columns
+
+        cells = []
+        inside_table = False
+        for blk in self.pdf_info['2']['blocks']:
+            if blk['type'] != 0:
+                continue
+            for row in blk['lines']:
+                for piece in row['spans']:
+                    _box, content = piece['bbox'], piece['text']
+                    if content == '序号':
+                        inside_table = True
+                    if '以上表格中所列的序号并非还款期数' in content:
+                        inside_table = False
+                    if inside_table:
+                        cells.append(content)
+
+        rows = []
+        for idx in range(len(cells) // columns):
+            record = cells[idx * columns:(idx + 1) * columns]
+            if record[1] == str(idx + 1):
+                break
+            rows.append(record)
+
+        if rows:
+            result['words'] = rows
+        return result
+
+    def get_signature_role_1(self):
+        """Report whether the '借款人(抵押人)' signature area contains text.
+
+        Every page of ``self.pdf_info`` is scanned.  Collection is switched on
+        by a span containing '借款人(抵押人)' and switched off by a span
+        containing '日期'; spans seen while it is on are collected.  The flag
+        is not reset between pages.
+
+        Returns:
+            dict: a copy of ``self.init_item`` with
+                * ``page_num`` - key of the page of the last collected span,
+                  or None when nothing was collected;
+                * ``position`` - [min_x, min_y, max_x, max_y] over all collected
+                  bboxes, or None when nothing was collected;
+                * ``words`` - '有' when more than four spans were collected,
+                  otherwise '无'.
+        """
+        signature_role_1 = self.init_item.copy()
+        # Locate the signature region first.
+        texts = []
+        boxes = []
+        page_num = None
+        region = False
+        for i, page in self.pdf_info.items():
+            for block in page['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text']
+                        if '借款人(抵押人)' in text:
+                            region = True
+                        if '日期' in text:
+                            region = False
+                        if region:
+                            page_num = i
+                            texts.append(text)
+                            boxes.append(bbox)
+        words = '有' if len(texts) > 4 else '无'
+        # Guard: with no collected bbox, min()/max() over the empty column
+        # raised ValueError; report "no position" instead.
+        position = None
+        if boxes:
+            corners = np.array(boxes).reshape((-1, 2))
+            position = [min(corners[:, 0]), min(corners[:, 1]),
+                        max(corners[:, 0]), max(corners[:, 1])]
+        signature_role_1['page_num'] = page_num
+        signature_role_1['position'] = position
+        signature_role_1['words'] = words
+        return signature_role_1
+
+    def get_signature_role_2(self):
+        """Report whether the '共同借款人(共同抵押人)' signature area contains text.
+
+        Every page of ``self.pdf_info`` is scanned.  Collection is switched on
+        by a span containing '共同借款人(共同抵押人)' and switched off by a span
+        containing '日期'; spans seen while it is on are collected.  The flag
+        is not reset between pages.
+
+        Returns:
+            dict: a copy of ``self.init_item`` with
+                * ``page_num`` - key of the page of the last collected span,
+                  or None when nothing was collected;
+                * ``position`` - [min_x, min_y, max_x, max_y] over all collected
+                  bboxes, or None when nothing was collected;
+                * ``words`` - '有' when more than four spans were collected,
+                  otherwise '无'.
+        """
+        signature_role_2 = self.init_item.copy()
+        # Locate the signature region first.
+        texts = []
+        boxes = []
+        page_num = None
+        region = False
+        for i, page in self.pdf_info.items():
+            for block in page['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text']
+                        if '共同借款人(共同抵押人)' in text:
+                            region = True
+                        if '日期' in text:
+                            region = False
+                        if region:
+                            page_num = i
+                            texts.append(text)
+                            boxes.append(bbox)
+        words = '有' if len(texts) > 4 else '无'
+        # Guard: with no collected bbox, min()/max() over the empty column
+        # raised ValueError; report "no position" instead.
+        position = None
+        if boxes:
+            corners = np.array(boxes).reshape((-1, 2))
+            position = [min(corners[:, 0]), min(corners[:, 1]),
+                        max(corners[:, 0]), max(corners[:, 1])]
+        signature_role_2['page_num'] = page_num
+        signature_role_2['position'] = position
+        signature_role_2['words'] = words
+        return signature_role_2
+
+    def get_signature_role_3(self):
+        """Report whether the '保证人1' signature area contains text.
+
+        Every page of ``self.pdf_info`` is scanned.  Collection is switched on
+        by a span containing '保证人1' on any page whose key is not 0 (page
+        keys must be convertible with ``int``), and switched off by a span
+        containing '日期'; spans seen while it is on are collected.  The flag
+        is not reset between pages.
+
+        Returns:
+            dict: a copy of ``self.init_item`` with
+                * ``page_num`` - key of the page of the last collected span,
+                  or None when nothing was collected;
+                * ``position`` - [min_x, min_y, max_x, max_y] over all collected
+                  bboxes, or None when nothing was collected;
+                * ``words`` - '有' when more than four spans were collected,
+                  otherwise '无'.
+        """
+        signature_role_3 = self.init_item.copy()
+        # Locate the signature region first.
+        texts = []
+        boxes = []
+        page_num = None
+        region = False
+        for i, page in self.pdf_info.items():
+            for block in page['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text']
+                        if '保证人1' in text and int(i) != 0:
+                            region = True
+                        if '日期' in text:
+                            region = False
+                        if region:
+                            page_num = i
+                            texts.append(text)
+                            boxes.append(bbox)
+        words = '有' if len(texts) > 4 else '无'
+        # Guard: with no collected bbox, min()/max() over the empty column
+        # raised ValueError; report "no position" instead.
+        position = None
+        if boxes:
+            corners = np.array(boxes).reshape((-1, 2))
+            position = [min(corners[:, 0]), min(corners[:, 1]),
+                        max(corners[:, 0]), max(corners[:, 1])]
+        signature_role_3['page_num'] = page_num
+        signature_role_3['position'] = position
+        signature_role_3['words'] = words
+        return signature_role_3
+
+    def get_signature_role_4(self):
+        """Report whether the '保证人2' signature area contains text.
+
+        Every page of ``self.pdf_info`` is scanned.  Collection is switched on
+        by a span containing '保证人2' on any page whose key is not 0 (page
+        keys must be convertible with ``int``), and switched off by a span
+        containing '日期'; spans seen while it is on are collected.  The flag
+        is not reset between pages.
+
+        Returns:
+            dict: a copy of ``self.init_item`` with
+                * ``page_num`` - key of the page of the last collected span,
+                  or None when nothing was collected;
+                * ``position`` - [min_x, min_y, max_x, max_y] over all collected
+                  bboxes, or None when nothing was collected;
+                * ``words`` - '有' when more than four spans were collected,
+                  otherwise '无'.
+        """
+        signature_role_4 = self.init_item.copy()
+        # Locate the signature region first.
+        texts = []
+        boxes = []
+        page_num = None
+        region = False
+        for i, page in self.pdf_info.items():
+            for block in page['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text']
+                        if '保证人2' in text and int(i) != 0:
+                            region = True
+                        if '日期' in text:
+                            region = False
+                        if region:
+                            page_num = i
+                            texts.append(text)
+                            boxes.append(bbox)
+        words = '有' if len(texts) > 4 else '无'
+        # Guard: with no collected bbox, min()/max() over the empty column
+        # raised ValueError; report "no position" instead.
+        position = None
+        if boxes:
+            corners = np.array(boxes).reshape((-1, 2))
+            position = [min(corners[:, 0]), min(corners[:, 1]),
+                        max(corners[:, 0]), max(corners[:, 1])]
+        signature_role_4['page_num'] = page_num
+        signature_role_4['position'] = position
+        signature_role_4['words'] = words
+        return signature_role_4
+
+    def get_signature_role_5(self):
+        """Report whether the '见证人签字' (witness) signature area contains text.
+
+        Every page of ``self.pdf_info`` is scanned.  Collection is switched on
+        by a span containing '见证人签字' on any page whose key is not 0 (page
+        keys must be convertible with ``int``), and switched off by a span
+        containing '年'; spans seen while it is on are collected.  The flag is
+        not reset between pages.
+
+        Returns:
+            dict: a copy of ``self.init_item`` with
+                * ``page_num`` - key of the page of the last collected span,
+                  or None when nothing was collected;
+                * ``position`` - [min_x, min_y, max_x, max_y] over all collected
+                  bboxes, or None when nothing was collected;
+                * ``words`` - '有' when more than four spans were collected,
+                  otherwise '无'.
+        """
+        signature_role_5 = self.init_item.copy()
+        # Locate the signature region first.
+        texts = []
+        boxes = []
+        page_num = None
+        region = False
+        for i, page in self.pdf_info.items():
+            for block in page['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text']
+                        if '见证人签字' in text and int(i) != 0:
+                            region = True
+                        if '年' in text:
+                            region = False
+                        if region:
+                            page_num = i
+                            texts.append(text)
+                            boxes.append(bbox)
+        # The leftover debug print of the collected texts was removed here.
+        words = '有' if len(texts) > 4 else '无'
+        # Guard: with no collected bbox, min()/max() over the empty column
+        # raised ValueError; report "no position" instead.
+        position = None
+        if boxes:
+            corners = np.array(boxes).reshape((-1, 2))
+            position = [min(corners[:, 0]), min(corners[:, 1]),
+                        max(corners[:, 0]), max(corners[:, 1])]
+        signature_role_5['page_num'] = page_num
+        signature_role_5['position'] = position
+        signature_role_5['words'] = words
+        return signature_role_5
+
+    def get_last_page_signature(self, page_num, top, bottom):
+        """Read a signer name and signing date between two marker texts.
+
+        Args:
+            page_num: key of the page in ``self.pdf_info`` to inspect.
+            top: marker text; bbox[1] of the last span containing it is the
+                upper bound.
+            bottom: marker text; bbox[1] of the last span containing it is the
+                lower bound.
+
+        Returns:
+            tuple: ``(signature_name, signature_date)``, copies of
+            ``self.item``.  They are filled from the last span that contains
+            '签署日期' and whose mean y lies strictly between the two bounds
+            (truncated with ``int``): the name is the text before the first
+            space, the date is the text after the last ':'.  Both share the
+            span's bbox.  Nothing is filled when either marker is missing.
+        """
+        signature_name = self.item.copy()
+        signature_date = self.item.copy()
+
+        def iter_spans():
+            # Yield (bbox, text) for every span of the type-0 blocks of the page.
+            for blk in self.pdf_info[page_num]['blocks']:
+                if blk['type'] != 0:
+                    continue
+                for row in blk['lines']:
+                    for piece in row['spans']:
+                        yield piece['bbox'], piece['text']
+
+        upper = lower = None
+        for box, content in iter_spans():
+            if top in content:
+                upper = box[1]
+            if bottom in content:
+                lower = box[1]
+
+        if upper is None or lower is None:
+            return signature_name, signature_date
+
+        for box, content in iter_spans():
+            if '签署日期' not in content:
+                continue
+            if not int(upper) < np.mean(box[1::2]) < int(lower):
+                continue
+            signature_name['words'] = content.split(' ')[0]
+            signature_name['position'] = box
+            signature_date['words'] = content.split(':')[-1]
+            signature_date['position'] = box
+        return signature_name, signature_date
+
+    def get_info(self):
+        """Run every field extractor and assemble the structured result.
+
+        The document is flagged as an ASP product when any OCR text on page
+        '0' contains '附加产品融资贷款本金总金额'.  ``self.gen_init_result`` is
+        then called and, if ``self.ocr_results`` has at most 8 pages, the
+        ``page_1`` .. ``page_8`` entries of ``self.init_result`` are filled in
+        by the individual ``get_*`` helpers.
+
+        Returns:
+            dict: ``{"is_asp": self.is_asp, "page_info": self.init_result}``.
+        """
+
+        # Decide first whether this is an ASP product: look only at the first
+        # page for the phrase '附加产品融资贷款本金总金额'.  The OCR results are
+        # used for this check rather than the PDF text blocks.
+        for key in self.ocr_results['0']:
+            bbox, text = self.ocr_results['0'][key]
+            if '附加产品融资贷款本金总金额' in text:
+                self.is_asp = True
+
+        self.gen_init_result(self.is_asp)
+
+        # NOTE(review): the guard admits documents with fewer than 8 pages,
+        # yet the body below indexes pages up to '7' - confirm that shorter
+        # documents cannot reach this point.
+        if len(list(self.ocr_results.keys())) <= 8:             # In the 8.5 version, customer samples have content spilling across pages; those cannot be recognised for now
+            # Page 1
+            # Contract number
+            contract_no = self.get_contract_no(page_num='0')
+            self.init_result['page_1']['合同编号'] = contract_no
+            # Price of the purchased vehicle
+            vehicle_price = self.get_vehicle_price()
+            self.init_result['page_1']['所购车辆价格'] = vehicle_price
+            # VIN
+            vin = self.get_vin()
+            self.init_result['page_1']['车架号'] = vin
+            # Loan principal (for an ASP product this item also carries the
+            # vehicle loan principal and the add-on product financing total)
+            upper, lower, asp_1, asp_2 = self.get_loan_principal()
+            self.init_result['page_1']['贷款本金金额']['大写'] = upper
+            self.init_result['page_1']['贷款本金金额']['小写'] = lower
+            self.init_result['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1
+            self.init_result['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
+            # Loan term
+            loan_term = self.get_loan_term()
+            self.init_result['page_1']['贷款期限'] = loan_term
+            # Breakdown of the add-on product financing total (ASP table)
+            asp_details_table = self.get_asp_details(page_num='0')
+            self.init_result['page_1']['附加产品融资贷款本金总金额明细'] = asp_details_table
+            # Borrower signature and date
+            signature = self.get_signature()
+            self.init_result['page_1']['借款人签字及时间'] = signature
+            #######################################
+            # Page 2
+            # Contract number
+            # NOTE(review): this reads page '0' while every other page reads
+            # its own index - confirm whether '1' was intended.
+            contract_no = self.get_contract_no(page_num='0')
+            self.init_result['page_2']['合同编号'] = contract_no
+            # Borrower and mortgagor (the address field originally contains spaces)
+            borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人：', bottom='共同借款人：')
+            # Fallback for compatibility with the 8.1 version
+            if borrower_name['words'] == None:
+                borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人：', bottom='共同借款人及共同抵押人：')
+            # Fallback for the version where vehicle and loan are separated
+            if borrower_name['words'] == None:
+                borrower_name, borrower_id = self.get_somebody(top='借款人：', bottom='共同借款人及抵押人：')
+            self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name
+            self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id
+            # Co-borrower and co-mortgagor
+            co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人：', bottom='保证人1：')
+            self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name
+            self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id
+            # Guarantor 1
+            first_guarantor_name, first_guarantor_id = self.get_somebody(top='保证人1：', bottom='保证人2：')
+            self.init_result['page_2']['保证人1']['name'] = first_guarantor_name
+            self.init_result['page_2']['保证人1']['id'] = first_guarantor_id
+            # Guarantor 2
+            second_guarantor_name, second_guarantor_id = self.get_somebody(top='保证人2：', bottom='第一章')
+            self.init_result['page_2']['保证人2']['name'] = second_guarantor_name
+            self.init_result['page_2']['保证人2']['id'] = second_guarantor_id
+            # Price of the purchased vehicle
+            vehicle_price = self.get_vehicle_price(page_num='1')
+            self.init_result['page_2']['所购车辆价格'] = vehicle_price
+            # VIN
+            vin = self.get_vin(page_num='1')
+            self.init_result['page_2']['车架号'] = vin
+            # Dealer
+            seller = self.get_seller()
+            self.init_result['page_2']['经销商'] = seller
+            # Loan principal (for an ASP product this item also carries the
+            # vehicle loan principal and the add-on product financing total)
+            upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1')
+            self.init_result['page_2']['贷款本金金额']['大写'] = upper
+            self.init_result['page_2']['贷款本金金额']['小写'] = lower
+            self.init_result['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1
+            self.init_result['page_2']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
+            # Loan term
+            loan_term = self.get_loan_term(page_num='1')
+            self.init_result['page_2']['贷款期限'] = loan_term
+            # Standard interest rate of this contract for the current period
+            standard_rate = self.get_standard_rate(page_num='1')
+            self.init_result['page_2']['标准利率'] = standard_rate
+            # Added in the 202212 release: borrower collection account
+            account, account_name, account_bank = self.get_borrower_collection_account()
+            self.init_result['page_2']['借款人收款账户']['账号'] = account
+            self.init_result['page_2']['借款人收款账户']['户名'] = account_name
+            self.init_result['page_2']['借款人收款账户']['开户行'] = account_bank
+            # Repayment account
+            account, account_name, account_bank = self.get_payback_account()
+            self.init_result['page_2']['还款账户']['账号'] = account
+            self.init_result['page_2']['还款账户']['户名'] = account_name
+            self.init_result['page_2']['还款账户']['开户行'] = account_bank
+            #######################################
+            # Page 3
+            # Contract number
+            contract_no = self.get_contract_no(page_num='2')
+            self.init_result['page_3']['合同编号'] = contract_no
+            # Repayment schedule (table)
+            repayment_schedule_table = self.get_repayment_schedule()
+            self.init_result['page_3']['还款计划表'] = repayment_schedule_table
+            #######################################
+            # Page 4
+            # Contract number
+            contract_no = self.get_contract_no(page_num='3')
+            self.init_result['page_4']['合同编号'] = contract_no
+            # Breakdown of the add-on product financing total (ASP table)
+            asp_details_table = self.get_asp_details(page_num='3')
+            self.init_result['page_4']['附加产品融资贷款本金总金额明细'] = asp_details_table
+            #######################################
+            # Page 5
+            # Contract number
+            contract_no = self.get_contract_no(page_num='4')
+            self.init_result['page_5']['合同编号'] = contract_no
+            #######################################
+            # Page 6
+            # Contract number
+            contract_no = self.get_contract_no(page_num='5')
+            self.init_result['page_6']['合同编号'] = contract_no
+            # Page 7
+            # Contract number
+            contract_no = self.get_contract_no(page_num='6')
+            self.init_result['page_7']['合同编号'] = contract_no
+            # Page 8
+            # Contract number
+            contract_no = self.get_contract_no(page_num='7')
+            self.init_result['page_8']['合同编号'] = contract_no
+            # Main borrower signature; retried with the alternative lower
+            # marker when the first attempt finds nothing.
+            signature_name, signature_date = self.get_last_page_signature(page_num='7',
+                                                top='合同编号', bottom='共同借款人')
+            if signature_name['words'] == None:
+                signature_name, signature_date = self.get_last_page_signature(page_num='7',
+                                                top='合同编号', bottom='共同借款人（抵押人）')
+            self.init_result['page_8']['主借人签字']['签字'] = signature_name
+            self.init_result['page_8']['主借人签字']['日期'] = signature_date
+            # Co-borrower signature; retried with the alternative upper marker.
+            signature_name, signature_date = self.get_last_page_signature(page_num='7',
+                                                top='共同借款人', bottom='保证人1')
+            if signature_name['words'] == None:
+                signature_name, signature_date = self.get_last_page_signature(page_num='7',
+                                                top='共同借款人（抵押人）', bottom='保证人1')
+            self.init_result['page_8']['共借人签字']['签字'] = signature_name
+            self.init_result['page_8']['共借人签字']['日期'] = signature_date
+            # Guarantor 1 signature
+            signature_name, signature_date = self.get_last_page_signature(page_num='7',
+                                                top='保证人1', bottom='保证人2')
+            self.init_result['page_8']['保证人1签字']['签字'] = signature_name
+            self.init_result['page_8']['保证人1签字']['日期'] = signature_date
+            # Guarantor 2 signature
+            signature_name, signature_date = self.get_last_page_signature(page_num='7',
+                                                top='保证人2', bottom='在本人面前亲笔签署本合同')
+            self.init_result['page_8']['保证人2签字']['签字'] = signature_name
+            self.init_result['page_8']['保证人2签字']['日期'] = signature_date
+            # Witness signature
+            signature_name, signature_date = self.get_last_page_signature(page_num='7',
+                                                top='在本人面前亲笔签署本合同', bottom='以下无正文')
+            self.init_result['page_8']['见证人签字']['签字'] = signature_name
+            self.init_result['page_8']['见证人签字']['日期'] = signature_date
+
+        # Repackage the output
+        new_results = {"is_asp": self.is_asp,
+                       "page_info": self.init_result
+        }
+        return new_results
\ No newline at end of file
--- a/src/common/electronic_hil_contract/get_char_fsm.py 0 → 100644
View file @e0d31a2
+++ b/src/common/electronic_hil_contract/get_char_fsm.py 0 → 100644
View file @e0d31a2
+import re
+import numpy as np
+from fuzzywuzzy import fuzz
+from shapely.geometry import Polygon
+
+def caculate_iou(g, p):
+    """Return the intersection-over-union of two polygons.
+
+    Args:
+        g: flat coordinate sequence (x0, y0, x1, y1, ...) of the first polygon;
+            it is reshaped to (-1, 2) points.
+        p: flat coordinate sequence of the second polygon, same layout.
+
+    Returns:
+        float: intersection area divided by union area, or 0.0 when the union
+        area is zero (both polygons degenerate), which previously raised
+        ZeroDivisionError.
+    """
+    g = Polygon(np.array(g).reshape((-1, 2)))
+    p = Polygon(np.array(p).reshape((-1, 2)))
+    inter = g.intersection(p).area
+    union = g.area + p.area - inter
+    if union == 0:
+        return 0.0
+    return inter / union
+
+def get_table_info(bbox_1, bbox_2, ocr_result):
+    """Return the text of the OCR entry found at a table cell.
+
+    The cell polygon takes its x coordinates (even indices) from ``bbox_2``
+    and its y coordinates (odd indices) from ``bbox_1``.
+
+    Args:
+        bbox_1: 8-value polygon supplying the y coordinates.
+        bbox_2: 8-value polygon supplying the x coordinates.
+        ocr_result: iterable of entries whose item 0 is a polygon and item 1
+            is the text.
+
+    Returns:
+        The text of the last entry whose IoU with the cell is positive, or ''
+        when no entry overlaps.
+    """
+    cell = [(bbox_2 if k % 2 == 0 else bbox_1)[k] for k in range(8)]
+    overlapping = [entry[1] for entry in ocr_result
+                   if caculate_iou(cell, entry[0]) > 0]
+    return overlapping[-1] if overlapping else ''
+
+class Finder:
+
+    def __init__(self, pdf_info):
+        """Store the parsed PDF and prepare the empty result templates.
+
+        Args:
+            pdf_info: per-page parsed PDF content, indexed by page key.
+
+        Attributes set:
+            item: template for one field (``words``, ``page``, ``position``).
+            init_result, init_result_1, init_result_2: three field-name ->
+                item mappings.  Each field gets its own copy of ``item``;
+                previously all fields shared one dict object, so an in-place
+                write to one field leaked into every other field and into the
+                template itself.
+        """
+        self.pdf_info = pdf_info
+        self.item = {"words": None,
+                     "page": None,
+                     "position": None,
+                    }
+
+        def blank(field_names):
+            # One independent copy of the template per field.
+            return {name: self.item.copy() for name in field_names}
+
+        # Formatted algorithm output
+        self.init_result = blank((
+            "合同编号",
+            "承租人-姓名",
+            "承租人-证件号码",
+            "承租人-法定代表人或授权代表",
+
+            "共同承租人-姓名",
+            "共同承租人-证件号码",
+            "共同承租人-法定代表人或授权代表",
+
+            "保证人1-姓名",
+            "保证人1-证件号码",
+            "保证人1-法定代表人或授权代表",
+
+            "保证人2-姓名",
+            "保证人2-证件号码",
+            "保证人2-法定代表人或授权代表",
+            "保证人3-姓名",
+            "保证人3-证件号码",
+            "保证人3-法定代表人或授权代表",
+            "合同编号（正文）",
+            "车辆识别代码",
+            "车辆卖方（经销商）",
+            "车辆原始销售价格（《机动车销售统一发票》所列金额）",
+            "车辆附加产品明细表",
+            "融资成本总额",
+            "租期",
+            "付款计划表",
+            "承租人收款账户-户名",
+            "承租人收款账户-银行账号",
+            "承租人收款账户-开户行",
+            "承租人扣款账户-户名",
+            "承租人扣款账户-银行账号",
+            "承租人扣款账户-开户行",
+            "签字页-承租人姓名",
+            "签字页-承租人签章",
+
+            "签字页-共同承租人姓名",
+            "签字页-共同承租人签章",
+
+            "签字页-保证人1姓名",
+            "签字页-保证人1签章",
+
+            "签字页-保证人2姓名",
+            "签字页-保证人2签章",
+            "签字页-保证人3姓名",
+            "签字页-保证人3签章",
+        ))
+
+        # Formatted output for the vehicle disposal agreement (different fields)
+        self.init_result_1 = blank((
+            "合同编号",
+            "承租人-姓名",
+            "承租人-证件号码",
+            "销售经销商",
+            "合同编号（正文）",
+            "签字页-承租人姓名",
+            "签字页-承租人证件号码",
+            "签字页-承租人签章",
+            "签字页-销售经销商",
+            "签字页-销售经销商签章",
+        ))
+
+        # Formatted output for the vehicle lease mortgage contract
+        self.init_result_2 = blank((
+            "合同编号",
+            "合同编号（正文）",
+            "抵押人姓名/名称",
+            "抵押人证件号码",
+            "抵押人配偶姓名/名称",
+            "抵押人配偶证件号码",
+            "车辆识别代码",
+            "租金总额",
+            "融资租赁期限",
+            "签字页-抵押人姓名",
+            "签字页-抵押人签章",
+            "签字页-抵押人配偶姓名",
+            "签字页-抵押人配偶签章",
+        ))
+
+    def get_contract_no(self, page_num):
+        """Extract the contract number printed on the given page.
+
+        The page is scanned for spans containing the label ``合同编号：``; the
+        text after the last full-width colon becomes the number.  If that
+        leaves the number empty, a fallback pass takes a span that contains
+        ``CH`` and whose top edge lies above the bottom edge of the reference
+        box (the label span when one was found).
+
+        Args:
+            page_num: Key of the page inside ``self.pdf_info``.
+
+        Returns:
+            dict: A copy of ``self.item`` with ``position``, ``page`` and
+            ``words`` overwritten when a number was found.
+        """
+        contract_no = self.item.copy()
+        # Only blocks whose type is 0 are inspected (presumably text blocks).
+        spans = [
+            span
+            for block in self.pdf_info[page_num]['blocks']
+            if block['type'] == 0
+            for line in block['lines']
+            for span in line['spans']
+        ]
+        for span in spans:
+            text = span['text']
+            if '合同编号：' in text:
+                contract_no['position'] = span['bbox']
+                contract_no['page'] = page_num
+                contract_no['words'] = text.split('：')[-1]
+
+        # Fallback: the value may sit in its own span next to the label.
+        # The reference box is captured once so that a hit does not move the
+        # comparison line, and the pass is skipped when no usable box exists
+        # instead of failing on the subscript.
+        anchor = contract_no['position']
+        has_anchor = anchor is not None and len(anchor) >= 4
+        if contract_no['words'] == '' and has_anchor:
+            for span in spans:
+                bbox, text = span['bbox'], span['text']
+                if bbox[1] < anchor[3] and 'CH' in text:
+                    contract_no['position'] = bbox
+                    contract_no['page'] = page_num
+                    contract_no['words'] = text
+        return contract_no
+
+    def get_vehicle_price(self, page_num='0'):
+        """Find the vehicle price quoted after ``所购车辆价格为人民币``.
+
+        Args:
+            page_num: Key of the page inside ``self.pdf_info``.
+
+        Returns:
+            dict: Copy of ``self.item``; ``position`` and ``words`` come from
+            the last matching span, ``words`` being the text after the final
+            ``币`` of that span.
+        """
+        result = self.item.copy()
+        scanned_blocks = [
+            blk for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0
+        ]
+        for blk in scanned_blocks:
+            for row in blk['lines']:
+                for piece in row['spans']:
+                    box, content = piece['bbox'], piece['text']
+                    if '所购车辆价格为人民币' not in content:
+                        continue
+                    result['position'] = box
+                    result['words'] = content.split('币')[-1]
+        return result
+
+    def get_contract_no_one(self):
+        """Find the contract number quoted in the body text.
+
+        The number may be split across spans, so for every page the texts of
+        all spans in type-0 blocks are concatenated (ASCII spaces removed)
+        before matching.  Two phrasings are tried per page, in this order:
+        ``（合同编号：[...]）`` and ``编号为...的``.  The first page that yields
+        a match wins.
+
+        A former third pattern, ``编号为(.*?)）的``, was dropped: any text it
+        matches is already matched by ``编号为(.*?)的``, which returned first,
+        so that branch could never run.
+
+        Returns:
+            dict: Copy of ``self.item``.  On a match ``page`` is the page key,
+            ``position`` is ``None`` and ``words`` is the captured number with
+            whitespace and any ``）`` removed.
+        """
+        contract_no = self.item.copy()
+        patterns = (r'（合同编号：\[(.*?)\]）', r'编号为(.*?)的')
+        for pno in self.pdf_info:
+            all_text = ''.join(
+                span['text']
+                for block in self.pdf_info[pno]['blocks']
+                if block['type'] == 0
+                for line in block['lines']
+                for span in line['spans']
+            ).replace(' ', '')
+            for pattern in patterns:
+                match_obj = re.search(pattern, all_text)
+                if match_obj is None:
+                    continue
+                # The match can span several spans, so no single box applies.
+                contract_no['position'] = None
+                contract_no['page'] = pno
+                contract_no['words'] = re.sub(r'\s', '', match_obj.group(1)).replace('）', '')
+                return contract_no
+        return contract_no
+
+    def get_key_value(self, key, page_num=None):
+        """Return the value that follows ``key`` inside a span.
+
+        Every span in type-0 blocks whose text contains ``key`` is a hit; the
+        value is the text after the last full-width colon, with ``。`` and all
+        whitespace removed.  Later hits overwrite earlier ones.
+
+        Args:
+            key: Substring to look for in the span text.
+            page_num: Page key to restrict the search to; when ``None`` every
+                page of ``self.pdf_info`` is searched in iteration order.
+
+        Returns:
+            dict: Copy of ``self.item`` with ``position``, ``page`` and
+            ``words`` set from the last hit, or unchanged when nothing matched.
+        """
+        value = self.item.copy()
+        # A single page and "all pages" share one loop; only the page list differs.
+        pages = self.pdf_info if page_num is None else [page_num]
+        for pno in pages:
+            for block in self.pdf_info[pno]['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        text = span['text']
+                        if key not in text:
+                            continue
+                        words = text.split('：')[-1].replace("。", "")
+                        value['position'] = span['bbox']
+                        value['page'] = pno
+                        value['words'] = re.sub(r"\s", "", words)
+        return value
+
+    def get_loan_principal(self, page_num='0'):
+        """Extract the principal amounts printed on a page.
+
+        First pass over the spans of type-0 blocks:
+          * a span whose ``fuzz.ratio`` against the joined amount characters
+            exceeds 15 supplies ``upper`` (text after the last ``：``);
+          * a span containing ``小写：¥`` supplies ``lower`` (text after ``¥``);
+          * a span equal to ``附加产品融资贷款本金总金额`` becomes the anchor.
+        Second pass (only when an anchor exists): spans containing
+        ``人民币：小写：`` with a bracketed amount supply ``asp_1`` when their
+        vertical centre is above the anchor's centre and ``asp_2`` when below.
+
+        Args:
+            page_num: Key of the page inside ``self.pdf_info``.
+
+        Returns:
+            tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of
+            ``self.item`` with ``position`` and ``words`` set when found.
+        """
+        chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
+                            '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
+        keyword_text = ''.join(chinese_keywords)
+        upper = self.item.copy()
+        lower = self.item.copy()
+        asp_1 = self.item.copy()
+        asp_2 = self.item.copy()
+        anchor_bbox = None
+        spans = [
+            span
+            for block in self.pdf_info[page_num]['blocks']
+            if block['type'] == 0
+            for line in block['lines']
+            for span in line['spans']
+        ]
+        for span in spans:
+            bbox, text = span['bbox'], span['text']
+            # NOTE(review): ``fuzz`` comes from the file's imports; presumably
+            # a fuzzy string-matching library - confirm which one.
+            if fuzz.ratio(keyword_text, text) > 15:
+                # ``text`` is rebound here, so the two checks below see the
+                # trimmed value for this span (same flow as before).
+                text = text.split('：')[-1].strip()
+                upper['position'] = bbox
+                upper['words'] = text
+            if '小写：¥' in text:
+                lower['position'] = bbox
+                lower['words'] = text.split('¥')[-1].strip()
+            if '附加产品融资贷款本金总金额' == text:
+                anchor_bbox = bbox
+        if anchor_bbox:
+            anchor_y = np.mean(anchor_bbox[1::2])
+            for span in spans:
+                bbox, text = span['bbox'], span['text']
+                if '人民币：小写：' not in text:
+                    continue
+                # A span with the label but no "[amount]" used to raise
+                # IndexError on ``findall(...)[0]``; it is skipped instead.
+                amount = re.search(r'人民币：小写：\[(.*)\]', text)
+                if amount is None:
+                    continue
+                center_y = np.mean(bbox[1::2])
+                if center_y < anchor_y:
+                    asp_1['position'] = bbox
+                    asp_1['words'] = amount.group(1)
+                if center_y > anchor_y:
+                    asp_2['position'] = bbox
+                    asp_2['words'] = amount.group(1)
+        return upper, lower, asp_1, asp_2
+
+    def get_loan_term(self, page_num='0'):
+        """Read the loan term (in months) from a page.
+
+        The texts of all spans in type-0 blocks are concatenated and searched
+        for ``贷款期限<digits>个月``.  When found, the last span containing
+        ``<digits>个月`` provides the position.
+
+        Args:
+            page_num: Key of the page inside ``self.pdf_info``.
+
+        Returns:
+            dict: Copy of ``self.item``; ``position`` and ``words`` (the digit
+            string) are set only when both the phrase and a span were found.
+        """
+        loan_term = self.item.copy()
+        page_spans = []
+        for blk in self.pdf_info[page_num]['blocks']:
+            if blk['type'] == 0:
+                for row in blk['lines']:
+                    for piece in row['spans']:
+                        page_spans.append((piece['bbox'], piece['text']))
+        joined = ''.join(content for _, content in page_spans)
+        found = re.search(r'贷款期限(\d+)个月', joined)
+        if not found:
+            return loan_term
+        months = found.group(1)
+        needle = f'{months}个月'
+        for box, content in page_spans:
+            if needle in content:
+                loan_term['position'] = box
+                loan_term['words'] = months
+        return loan_term
+
+    def get_asp_details(self, page_num):
+        """Collect the add-on product detail table of a page as rows of text.
+
+        Collection starts at the span whose text equals
+        ``附加产品融资贷款本金总金额明细`` (that span included) and stops at a
+        span containing ``第二条`` or ``征信管理`` (that span excluded).  The
+        first collected text forms a one-cell row; the rest is grouped three
+        per row, and a trailing incomplete group is dropped.
+
+        Args:
+            page_num: Key of the page inside ``self.pdf_info``.
+
+        Returns:
+            dict: Copy of ``self.item``; ``words`` holds the list of rows when
+            at least one text was collected.
+        """
+        details = self.item.copy()
+
+        collected = []
+        inside_table = False
+        for blk in self.pdf_info[page_num]['blocks']:
+            if blk['type'] != 0:
+                continue
+            for row in blk['lines']:
+                for piece in row['spans']:
+                    box, content = piece['bbox'], piece['text']
+                    if content == '附加产品融资贷款本金总金额明细':
+                        inside_table = True
+                    if '第二条' in content or '征信管理' in content:
+                        inside_table = False
+                    if inside_table:
+                        collected.append(content)
+
+        row_count = (len(collected) + 2) // 3
+        rows = []
+        if row_count:
+            # Row 0 is the title alone; row i (i >= 1) is cells 3i-2 .. 3i.
+            rows.append([collected[0]])
+            rows.extend(collected[3 * i - 2:3 * i + 1] for i in range(1, row_count))
+
+        if rows:
+            details['words'] = rows
+        return details
+
+    def get_signature(self):
+        """Return the last span containing ``签署日期`` on the page keyed ``'0'``.
+
+        Returns:
+            dict: Copy of ``self.item``; ``words`` is the full span text and
+            ``position`` its bounding box when such a span exists.
+        """
+        found = self.item.copy()
+
+        candidates = [
+            piece
+            for blk in self.pdf_info['0']['blocks']
+            if blk['type'] == 0
+            for row in blk['lines']
+            for piece in row['spans']
+        ]
+        for piece in candidates:
+            box, content = piece['bbox'], piece['text']
+            if '签署日期' not in content:
+                continue
+            found['words'] = content
+            found['position'] = box
+        return found
+
+    def get_somebody(self, top, bottom):
+        """Return the name and ID number found between two marker spans.
+
+        Works on the page keyed ``'1'``.  The bottom edge of the last span
+        containing ``top`` is the upper bound and the bottom edge of the last
+        span containing ``bottom`` is the lower bound (both default to 0).
+        Spans whose own bottom edge lies strictly between the bounds are then
+        searched for the name and ID labels.
+
+        Args:
+            top: Text marking the upper boundary.
+            bottom: Text marking the lower boundary.
+
+        Returns:
+            tuple: ``(_name, _id)``, copies of ``self.item`` with ``position``
+            and ``words`` (text after the last ``：``) set when found.
+        """
+        name_item = self.item.copy()
+        id_item = self.item.copy()
+
+        def page_spans():
+            # Yield every span of the type-0 blocks on page '1'.
+            for blk in self.pdf_info['1']['blocks']:
+                if blk['type'] == 0:
+                    for row in blk['lines']:
+                        yield from row['spans']
+
+        # Step 1: fix the vertical bounds.
+        upper_y = 0
+        lower_y = 0
+        for piece in page_spans():
+            box, content = piece['bbox'], piece['text']
+            if top in content:
+                upper_y = box[3]
+            if bottom in content:
+                lower_y = box[3]
+
+        # Step 2: pick up the labelled values inside the bounds.
+        for piece in page_spans():
+            box, content = piece['bbox'], piece['text']
+            if not (upper_y < box[3] < lower_y):
+                continue
+            if '姓名/名称' in content:
+                name_item['position'] = box
+                name_item['words'] = content.split('：')[-1]
+            if '自然人身份证件号码/法人执照号码' in content:
+                id_item['position'] = box
+                id_item['words'] = content.split('：')[-1]
+        return name_item, id_item
+
+    def get_seller(self):
+        """Find the dealer name printed to the right of the ``经销商`` label.
+
+        Works on the page keyed ``'1'``.  The last span whose text equals
+        ``经销商`` is the anchor.  A span is taken as the value when its
+        horizontal centre lies between the anchor's right edge and half the
+        page width, and its vertical centre lies within the anchor's vertical
+        extent.  The last such span wins.
+
+        Returns:
+            dict: Copy of ``self.item`` with ``position`` and ``words`` set
+            when a value span was found.
+        """
+        seller = self.item.copy()
+        spans = [
+            piece
+            for blk in self.pdf_info['1']['blocks']
+            if blk['type'] == 0
+            for row in blk['lines']
+            for piece in row['spans']
+        ]
+
+        # Locate the label first.
+        label_box = None
+        for piece in spans:
+            if piece['text'] == '经销商':
+                label_box = piece['bbox']
+        if not label_box:
+            return seller
+
+        # Then match the value relative to the label.
+        half_width = self.pdf_info['1']['width'] * 0.5
+        for piece in spans:
+            box, content = piece['bbox'], piece['text']
+            center_x = np.mean(box[::2])
+            center_y = np.mean(box[1::2])
+            right_of_label = label_box[2] < center_x < half_width
+            same_row = label_box[1] < center_y < label_box[3]
+            if right_of_label and same_row:
+                seller['position'] = box
+                seller['words'] = content
+        return seller
+
+    def get_payback_account(self):
+        """Extract the account number, holder name and bank from page ``'1'``.
+
+        The texts of all spans in type-0 blocks are concatenated.  Only when
+        the result contains ``☑账号`` are the three values pulled out with
+        regular expressions; for each value, the last span containing it
+        supplies the position.
+
+        Returns:
+            tuple: ``(account, account_name, account_bank)``, copies of
+            ``self.item`` with ``position`` and ``words`` set when found.
+        """
+        account = self.item.copy()
+        account_name = self.item.copy()
+        account_bank = self.item.copy()
+
+        spans = [
+            piece
+            for blk in self.pdf_info['1']['blocks']
+            if blk['type'] == 0
+            for row in blk['lines']
+            for piece in row['spans']
+        ]
+        all_text = ''.join(piece['text'] for piece in spans)
+
+        # Decide which account layout is present; only the ticked "账号"
+        # variant is extracted.
+        if '☑账号' not in all_text:
+            return account, account_name, account_bank
+        all_text = all_text.replace('　', '')
+
+        # (pattern over the joined text, target item, span-level hit test)
+        lookups = (
+            (r'账号：(.*)户名', account,
+             lambda value, content: f'{value}' in content),
+            (r'户名：(.*)开户行', account_name,
+             lambda value, content: f'{value}' in content),
+            (r'开户行：(.*)；', account_bank,
+             lambda value, content: f'开户行：{value}；' in content.replace('　', '')),
+        )
+        for pattern, target, is_hit in lookups:
+            hits = re.findall(pattern, all_text)
+            if not hits:
+                continue
+            value = hits[0]
+            for piece in spans:
+                if is_hit(value, piece['text']):
+                    target['position'] = piece['bbox']
+                    target['words'] = value
+        return account, account_name, account_bank
+
+    def get_repayment_schedule(self):
+        """Collect the rent column of the repayment schedule table.
+
+        All pages are walked in order.  Collection is switched on after the
+        span whose stripped text is ``61.`` (its left edge becomes the left
+        bound and its page key is reported) and switched off by a span
+        containing ``以上表格中所列序号``.  The right bound is the right edge of
+        the last span seen so far that contains ``剩余融资``.  While switched
+        on, a span is kept unless it contains a CJK character, contains
+        exactly one run of digits, or starts outside the bounds.
+
+        Kept texts are grouped four per row (a trailing incomplete group is
+        dropped) and only the third cell of each row is reported under
+        ``租金``, next to a 1-based row number.
+
+        Returns:
+            dict: Copy of ``self.item`` with ``words`` set to the table
+            (header row ``['序号', '租金']`` first) and ``page`` set to the
+            page key of the ``61.`` span, or ``None`` if it never appeared.
+        """
+        repayment_schedule = self.item.copy()
+
+        # Compiled once instead of once per span.
+        has_chinese = re.compile(r'[\u4e00-\u9fff]')
+        digit_runs = re.compile(r"\d+")
+
+        repayment_schedule_text_list = []
+        table = False
+        page = None
+        left = 0
+        right = 0
+        for pno in self.pdf_info:
+            for block in self.pdf_info[pno]['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text']
+                        if '剩余融资' in text:
+                            right = bbox[2]
+                        if '以上表格中所列序号' in text:
+                            table = False
+                        if table:
+                            # Drop anything containing Chinese characters.
+                            if has_chinese.search(text):
+                                continue
+                            # Drop the "1." .. "61." style headings, i.e.
+                            # texts with exactly one run of digits.
+                            if len(digit_runs.findall(text)) == 1:
+                                continue
+                            if not left < bbox[0] < right:
+                                continue
+                            repayment_schedule_text_list.append(text)
+
+                        # Checked last so the "61." span itself is not collected.
+                        if text.strip() == "61.":
+                            page = pno
+                            table = True
+                            left = bbox[0]
+
+        repayment_schedule_table = [['序号', '租金']]
+        # 4 = number of kept columns per table row.
+        for i in range(len(repayment_schedule_text_list) // 4):
+            cells = repayment_schedule_text_list[i * 4:i * 4 + 4]
+            # Keep only the row number and the third cell (reported as 租金).
+            repayment_schedule_table.append([f'{i+1}', cells[2]])
+
+        repayment_schedule['words'] = repayment_schedule_table
+        repayment_schedule['page'] = page
+        return repayment_schedule
+
+    def get_signature_role_1(self):
+        """Return the last span containing ``签署日期`` across all pages.
+
+        Returns:
+            dict: Copy of ``self.item``; ``position``, ``page`` and ``words``
+            (the full span text) are set when such a span exists.
+        """
+        result = self.item.copy()
+        for page_key in self.pdf_info:
+            page_spans = (
+                piece
+                for blk in self.pdf_info[page_key]['blocks']
+                if blk['type'] == 0
+                for row in blk['lines']
+                for piece in row['spans']
+            )
+            for piece in page_spans:
+                box, content = piece['bbox'], piece['text']
+                if '签署日期' not in content:
+                    continue
+                result.update(position=box, page=page_key, words=content)
+        return result
+
+    def get_signature_role_2(self):
+        """Report whether the co-borrower signing area holds a signature.
+
+        Across all pages, spans are collected from one containing
+        ``共同借款人(共同抵押人)`` up to (not including) the next span
+        containing ``日期``.  More than four collected spans is reported as
+        ``'有'``, otherwise ``'无'``.
+
+        Returns:
+            dict: Copy of ``self.init_item`` with ``page_num`` (page key of
+            the last collected span), ``position`` (box enclosing all
+            collected spans, ``None`` when nothing was collected) and
+            ``words``.
+        """
+        # NOTE(review): this reads ``self.init_item`` and writes the key
+        # ``page_num``, while sibling getters use ``self.item`` / ``page`` -
+        # confirm against the initializer and the callers.
+        signature_role_2 = self.init_item.copy()
+        # Locate the signing region first.
+        texts = []
+        boxes = []
+        page_num = None
+        region = False
+        for pno in self.pdf_info:
+            for block in self.pdf_info[pno]['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text']
+                        if '共同借款人(共同抵押人)' in text:
+                            region = True
+                        if '日期' in text:
+                            region = False
+                        if region:
+                            page_num = pno
+                            texts.append(text)
+                            boxes.append(bbox)
+        words = '有' if len(texts) > 4 else '无'
+        # With no region found, min()/max() on the empty array used to raise
+        # ValueError; report "no signature" without a position instead.
+        position = None
+        if boxes:
+            corners = np.array(boxes).reshape((-1, 2))
+            position = [min(corners[:, 0]), min(corners[:, 1]),
+                        max(corners[:, 0]), max(corners[:, 1])]
+        signature_role_2['page_num'] = page_num
+        signature_role_2['position'] = position
+        signature_role_2['words'] = words
+        return signature_role_2
+
+    def get_signature_role_3(self):
+        """Report whether the guarantor-1 signing area holds a signature.
+
+        Across all pages, spans are collected from one containing ``保证人1``
+        on a page whose key is not 0 (as an integer) up to (not including)
+        the next span containing ``日期``.  More than four collected spans is
+        reported as ``'有'``, otherwise ``'无'``.
+
+        Returns:
+            dict: Copy of ``self.init_item`` with ``page_num`` (page key of
+            the last collected span), ``position`` (box enclosing all
+            collected spans, ``None`` when nothing was collected) and
+            ``words``.
+        """
+        # NOTE(review): this reads ``self.init_item`` and writes the key
+        # ``page_num``, while sibling getters use ``self.item`` / ``page`` -
+        # confirm against the initializer and the callers.
+        signature_role_3 = self.init_item.copy()
+        # Locate the signing region first.
+        texts = []
+        boxes = []
+        page_num = None
+        region = False
+        for pno in self.pdf_info:
+            for block in self.pdf_info[pno]['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text']
+                        if '保证人1' in text and int(pno) != 0:
+                            region = True
+                        if '日期' in text:
+                            region = False
+                        if region:
+                            page_num = pno
+                            texts.append(text)
+                            boxes.append(bbox)
+        words = '有' if len(texts) > 4 else '无'
+        # With no region found, min()/max() on the empty array used to raise
+        # ValueError; report "no signature" without a position instead.
+        position = None
+        if boxes:
+            corners = np.array(boxes).reshape((-1, 2))
+            position = [min(corners[:, 0]), min(corners[:, 1]),
+                        max(corners[:, 0]), max(corners[:, 1])]
+        signature_role_3['page_num'] = page_num
+        signature_role_3['position'] = position
+        signature_role_3['words'] = words
+        return signature_role_3
+
+    def get_signature_role_4(self):
+        """Report whether the guarantor-2 signing area holds a signature.
+
+        Across all pages, spans are collected from one containing ``保证人2``
+        on a page whose key is not 0 (as an integer) up to (not including)
+        the next span containing ``日期``.  More than four collected spans is
+        reported as ``'有'``, otherwise ``'无'``.
+
+        Returns:
+            dict: Copy of ``self.init_item`` with ``page_num`` (page key of
+            the last collected span), ``position`` (box enclosing all
+            collected spans, ``None`` when nothing was collected) and
+            ``words``.
+        """
+        # NOTE(review): this reads ``self.init_item`` and writes the key
+        # ``page_num``, while sibling getters use ``self.item`` / ``page`` -
+        # confirm against the initializer and the callers.
+        signature_role_4 = self.init_item.copy()
+        # Locate the signing region first.
+        texts = []
+        boxes = []
+        page_num = None
+        region = False
+        for pno in self.pdf_info:
+            for block in self.pdf_info[pno]['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text']
+                        if '保证人2' in text and int(pno) != 0:
+                            region = True
+                        if '日期' in text:
+                            region = False
+                        if region:
+                            page_num = pno
+                            texts.append(text)
+                            boxes.append(bbox)
+        words = '有' if len(texts) > 4 else '无'
+        # With no region found, min()/max() on the empty array used to raise
+        # ValueError; report "no signature" without a position instead.
+        position = None
+        if boxes:
+            corners = np.array(boxes).reshape((-1, 2))
+            position = [min(corners[:, 0]), min(corners[:, 1]),
+                        max(corners[:, 0]), max(corners[:, 1])]
+        signature_role_4['page_num'] = page_num
+        signature_role_4['position'] = position
+        signature_role_4['words'] = words
+        return signature_role_4
+
+    def get_signature_role_5(self):
+        """Report whether the witness signing area holds a signature.
+
+        Across all pages, spans are collected from one containing
+        ``见证人签字`` on a page whose key is not 0 (as an integer) up to (not
+        including) the next span containing ``年``.  More than four collected
+        spans is reported as ``'有'``, otherwise ``'无'``.
+
+        Returns:
+            dict: Copy of ``self.init_item`` with ``page_num`` (page key of
+            the last collected span), ``position`` (box enclosing all
+            collected spans, ``None`` when nothing was collected) and
+            ``words``.
+        """
+        # NOTE(review): this reads ``self.init_item`` and writes the key
+        # ``page_num``, while sibling getters use ``self.item`` / ``page`` -
+        # confirm against the initializer and the callers.
+        signature_role_5 = self.init_item.copy()
+        # Locate the signing region first.
+        texts = []
+        boxes = []
+        page_num = None
+        region = False
+        for pno in self.pdf_info:
+            for block in self.pdf_info[pno]['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text']
+                        if '见证人签字' in text and int(pno) != 0:
+                            region = True
+                        if '年' in text:
+                            region = False
+                        if region:
+                            page_num = pno
+                            texts.append(text)
+                            boxes.append(bbox)
+        words = '有' if len(texts) > 4 else '无'
+        # With no region found, min()/max() on the empty array used to raise
+        # ValueError; report "no signature" without a position instead.
+        position = None
+        if boxes:
+            corners = np.array(boxes).reshape((-1, 2))
+            position = [min(corners[:, 0]), min(corners[:, 1]),
+                        max(corners[:, 0]), max(corners[:, 1])]
+        signature_role_5['page_num'] = page_num
+        signature_role_5['position'] = position
+        signature_role_5['words'] = words
+        return signature_role_5
+
+    def get_last_page_signature(self, page_num, top, bottom):
+        """Read signer name and signing date between two marker spans.
+
+        The top edges of the last spans containing ``top`` and ``bottom`` on
+        the page bound the search.  A span containing ``签署日期`` whose
+        vertical centre lies strictly between the two bounds (truncated to
+        integers) supplies the name (text before the first ASCII space) and
+        the date (text after the last ASCII colon).
+
+        Args:
+            page_num: Key of the page inside ``self.pdf_info``.
+            top: Text marking the upper boundary.
+            bottom: Text marking the lower boundary.
+
+        Returns:
+            tuple: ``(signature_name, signature_date)``, copies of
+            ``self.item`` with ``words`` and ``position`` set when found.
+        """
+        signature_name = self.item.copy()
+        signature_date = self.item.copy()
+        spans = [
+            span
+            for block in self.pdf_info[page_num]['blocks']
+            if block['type'] == 0
+            for line in block['lines']
+            for span in line['spans']
+        ]
+        anchor_top = None
+        anchor_bottom = None
+        for span in spans:
+            bbox, text = span['bbox'], span['text']
+            if top in text:
+                anchor_top = bbox[1]
+            if bottom in text:
+                anchor_bottom = bbox[1]
+        if anchor_top is None or anchor_bottom is None:
+            return signature_name, signature_date
+        for span in spans:
+            bbox, text = span['bbox'], span['text']
+            if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
+                signature_name['words'] = text.split(' ')[0]
+                signature_name['position'] = bbox
+                signature_date['words'] = text.split(':')[-1]
+                # Previously the name's position was assigned twice and the
+                # date's position was never set.
+                signature_date['position'] = bbox
+        return signature_name, signature_date
+
+    def get_electronic_signature(self, top, bottom):
+        """Find the ``签署日期`` span lying between two marker spans.
+
+        Over all pages, the top edge of the last span containing ``top`` and
+        the bottom edge of the last span containing ``bottom`` form the
+        bounds.  When both exist, every page is searched for spans containing
+        ``签署日期`` whose vertical centre lies strictly between the bounds
+        (truncated to integers); the last one wins.
+
+        Args:
+            top: Text marking the upper boundary.
+            bottom: Text marking the lower boundary.
+
+        Returns:
+            dict: Copy of ``self.item`` with ``words`` (full span text),
+            ``page`` and ``position`` set when found.
+        """
+        signature = self.item.copy()
+
+        def spans_of(page_key):
+            # Yield every span of the type-0 blocks on one page.
+            for blk in self.pdf_info[page_key]['blocks']:
+                if blk['type'] == 0:
+                    for row in blk['lines']:
+                        yield from row['spans']
+
+        upper = None
+        lower = None
+        for page_key in self.pdf_info:
+            for piece in spans_of(page_key):
+                box, content = piece['bbox'], piece['text']
+                if top in content:
+                    upper = box[1]
+                if bottom in content:
+                    lower = box[3]
+        if upper is None or lower is None:
+            return signature
+
+        for page_key in self.pdf_info:
+            for piece in spans_of(page_key):
+                box, content = piece['bbox'], piece['text']
+                if '签署日期' not in content:
+                    continue
+                if int(upper) < np.mean(box[1::2]) < int(lower):
+                    signature['words'] = content
+                    signature['page'] = page_key
+                    signature['position'] = box
+        return signature
+
+    def get_role_info(self, role_key, page_num='0'):
+        """Extract name, ID number and representative for one contract role.
+
+        The top-left corner of the last span starting with ``保证人3`` is the
+        reference point; nothing is extracted without it.  The name comes
+        from spans matching ``role_key`` (used as a regular expression at the
+        start of the text).  For the four known role keys, the ID number and
+        representative come from spans starting with the respective label
+        whose centre lies in that role's quadrant relative to the reference
+        point:
+
+            承租人：  left / above      保证人2： right / above
+            保证人1： left / below      保证人3： right / below
+
+        Args:
+            role_key: Role label, e.g. ``'承租人：'``.
+            page_num: Key of the page inside ``self.pdf_info``.
+
+        Returns:
+            tuple: ``(name, id_num, representative)``, copies of ``self.item``
+            with ``words`` (text after the last ``：``), ``page`` and
+            ``position`` set when found.
+        """
+        name = self.item.copy()
+        id_num = self.item.copy()
+        representative = self.item.copy()
+
+        spans = [
+            piece
+            for blk in self.pdf_info[page_num]['blocks']
+            if blk['type'] == 0
+            for row in blk['lines']
+            for piece in row['spans']
+        ]
+
+        # Reference point: top-left corner of the 保证人3 span.
+        anchor = None
+        for piece in spans:
+            if re.match('保证人3', piece['text']) is not None:
+                anchor = [piece['bbox'][0], piece['bbox'][1]]
+        if anchor is None:
+            return name, id_num, representative
+
+        # role key -> (centre left of anchor?, centre above anchor?)
+        quadrants = {
+            '承租人：': (True, True),
+            '保证人1：': (True, False),
+            '保证人2：': (False, True),
+            '保证人3：': (False, False),
+        }
+        side = quadrants.get(role_key)
+
+        def in_quadrant(box):
+            # True when the box centre lies in the quadrant of ``role_key``.
+            want_left, want_above = side
+            center_x = np.mean(box[::2])
+            center_y = np.mean(box[1::2])
+            x_ok = center_x < anchor[0] if want_left else center_x > anchor[0]
+            y_ok = center_y < anchor[1] if want_above else center_y > anchor[1]
+            return x_ok and y_ok
+
+        def fill(target, box, content):
+            # Store the text after the last full-width colon plus location.
+            target['words'] = content.split('：')[-1]
+            target['page'] = page_num
+            target['position'] = box
+
+        for piece in spans:
+            box, content = piece['bbox'], piece['text']
+            # Role name.
+            if re.match(role_key, content) is not None:
+                fill(name, box, content)
+            if side is None:
+                continue
+            # ID number inside the role's quadrant.
+            if re.match('证件号码：', content) is not None and in_quadrant(box):
+                fill(id_num, box, content)
+            # Legal or authorised representative inside the role's quadrant.
+            if re.match('法定代表人或授权代表：', content) is not None and in_quadrant(box):
+                fill(representative, box, content)
+        return name, id_num, representative
+
+    def get_table_add_product(self):
+        """Collect the rows of the vehicle add-on product table.
+
+        The page is chosen by the caption '车辆附加产品（明细见下表）' (the last
+        page containing it wins). Rows are gathered by walking downwards from
+        the '项目' header cell, and a '总计' row is appended at the end.
+
+        Returns:
+            dict: a copy of ``self.item``. On success ``words`` holds the row
+            list (``[item, purchase price, financed amount]``, header row
+            first), ``page`` holds the page key and ``position`` is None.
+            When the caption or any of the cells '项目', '购买价格',
+            '实际融资金额', '总计' cannot be found, the copy is returned
+            unchanged instead of raising.
+        """
+        table_add_product = self.item.copy()
+
+        add_product_page_num = None
+        for pno in self.pdf_info:
+            for block in self.pdf_info[f'{pno}']['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        if '车辆附加产品（明细见下表）' in span['text']:
+                            add_product_page_num = pno
+        # No caption on any page: nothing to parse. Indexing the page below
+        # would otherwise fail on the key 'None'.
+        if add_product_page_num is None:
+            return table_add_product
+
+        # Flatten the page's text spans into [quad, text] pairs, where quad is
+        # the (xmin, ymin, xmax, ymax) box expanded to its four corner points.
+        ocr_results = []
+        for block in self.pdf_info[f'{add_product_page_num}']['blocks']:
+            if block['type'] != 0:
+                continue
+            for line in block['lines']:
+                for span in line['spans']:
+                    xmin, ymin, xmax, ymax = span['bbox']
+                    quad = [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]
+                    ocr_results.append([quad, span['text']])
+
+        lines = [['项目', '购买价格', '实际融资金额']]
+
+        # Index of the last span whose text equals each header / total cell.
+        cell_index = {'项目': None, '购买价格': None, '实际融资金额': None, '总计': None}
+        for index, span in enumerate(ocr_results):
+            if span[1] in cell_index:
+                cell_index[span[1]] = index
+        # Without every reference cell the table cannot be read; bail out
+        # rather than index ocr_results with None.
+        if any(index is None for index in cell_index.values()):
+            return table_add_product
+        key_xm = cell_index['项目']
+        key_gmjg = cell_index['购买价格']
+        key_sjrzje = cell_index['实际融资金额']
+        key_total = cell_index['总计']
+
+        bbox, text = ocr_results[key_xm]
+        # Height of the header cell (ymin vs ymax of the quad).
+        rh = abs(bbox[1]-bbox[-1])
+        # Probe box: the header cell shifted right by two row heights and
+        # down by one, i.e. where the first data row is expected.
+        anchor = np.array(bbox).reshape((-1, 2))
+        anchor[:, 0] += 2*rh
+        anchor[:, 1] += rh
+
+        # Up to five passes over the spans; each hit becomes a row and moves
+        # the probe box one row height below the span that was hit.
+        for i in range(5):
+            for span in ocr_results:
+                iou = caculate_iou(anchor, span[0])
+                if iou > 0.01 and span[1].strip() != '所购':
+                    x = get_table_info(span[0], ocr_results[key_gmjg][0], ocr_results)
+                    y = get_table_info(span[0], ocr_results[key_sjrzje][0], ocr_results)
+                    line = [span[1].replace('\u3000', ' '), x, y]
+                    lines.append(line)
+                    anchor = np.array(span[0]).reshape((-1, 2))
+                    anchor[:, 1] += rh
+
+        total = get_table_info(ocr_results[key_total][0], ocr_results[key_sjrzje][0], ocr_results)
+        lines.append(['总计', '', total])
+
+        # Product names wrap onto a second line in the PDF, so only the first
+        # fragment is captured. Observed fragments and their continuations:
+        #   所购　BMW悦然焕 / 新服务
+        #   所购　BMW5年10 / 万公里长悦保养套餐
+        #   所购　事故维修补偿 / 方案
+        #   所购 BMW5年10万公里 / 长悦保养套餐
+        #   所购 MINI4年6万公里长悦 / 保养套餐
+        # The fragments are mapped back to the full product names below.
+        filtered_lines = []
+        for line in lines:
+            if line[0][:2] not in ['所购', '项目', '总计']:
+                continue
+            if 'BMW悦然' in line[0]:
+                line[0] = '所购 BMW悦然焕新服务'
+            if 'BMW5年10' in line[0]:
+                line[0] = '所购 BMW5年10万公里长悦保养套餐'
+            if '事故维修补' in line[0]:
+                line[0] = '所购 事故维修补偿方案'
+            if 'MINI4年6万公里长悦' in line[0]:
+                line[0] = '所购 MINI4年6万公里长悦保养套餐'
+            filtered_lines.append(line)
+        table_add_product['words'] = filtered_lines
+        table_add_product['page'] = add_product_page_num
+        table_add_product['position'] = None
+        return table_add_product
+
+    def get_contract_no_dy(self):
+        """Find the mortgage contract number.
+
+        The label span containing '抵押合同编号' is located first (last match
+        over all pages wins). The result is then the last text span, on any
+        page, whose vertical centre lies strictly inside the label's y-range
+        and whose text contains 'CH-'.
+
+        Returns:
+            dict: a copy of ``self.item`` with ``position``, ``page`` and
+            ``words`` filled in when a match exists, otherwise unchanged.
+        """
+        def _text_spans():
+            # Yield (page key, bbox, text) for every span of every type-0 block.
+            for page_key in self.pdf_info:
+                for blk in self.pdf_info[page_key]['blocks']:
+                    if blk['type'] != 0:
+                        continue
+                    for row in blk['lines']:
+                        for piece in row['spans']:
+                            yield page_key, piece['bbox'], piece['text']
+
+        contract_no = self.item.copy()
+
+        label_box = None
+        for _, box, content in _text_spans():
+            if '抵押合同编号' in content:
+                label_box = box
+        if label_box is None:
+            return contract_no
+
+        for page_key, box, content in _text_spans():
+            center_y = np.mean(box[1::2])
+            if label_box[1] < center_y < label_box[3] and 'CH-' in content:
+                contract_no['position'] = box
+                contract_no['page'] = page_key
+                contract_no['words'] = content
+        return contract_no
+
+    def get_dyr_name_id(self):
+        """Find the mortgagor's name and ID number.
+
+        The span whose text is exactly '抵押人' is used as the label (last
+        match over all pages wins). Spans on any page whose vertical centre
+        lies between the label's top and three label heights below its bottom
+        are inspected: text containing '姓名' fills the name, text containing
+        '证件号码' fills the ID. The value is the part after the last '：'.
+
+        Returns:
+            tuple: ``(name, _id)``, each a copy of ``self.item``.
+        """
+        def _text_spans():
+            # Yield (page key, bbox, text) for every span of every type-0 block.
+            for page_key in self.pdf_info:
+                for blk in self.pdf_info[page_key]['blocks']:
+                    if blk['type'] != 0:
+                        continue
+                    for row in blk['lines']:
+                        for piece in row['spans']:
+                            yield page_key, piece['bbox'], piece['text']
+
+        name = self.item.copy()
+        _id = self.item.copy()
+
+        label_box = None
+        for _, box, content in _text_spans():
+            if content == '抵押人':
+                label_box = box
+        if label_box is None:
+            return name, _id
+
+        row_height = abs(label_box[1] - label_box[3])
+        for page_key, box, content in _text_spans():
+            center_y = np.mean(box[1::2])
+            if not (label_box[1] < center_y < label_box[3] + row_height * 3):
+                continue
+            for marker, slot in (('姓名', name), ('证件号码', _id)):
+                if marker in content:
+                    slot['position'] = box
+                    slot['page'] = page_key
+                    slot['words'] = content.split('：')[-1]
+        return name, _id
+
+    def get_dyrpo_name_id(self):
+        """Find the name and ID number of the mortgagor's spouse.
+
+        Works like the mortgagor lookup, but the label is the span whose text
+        is exactly '抵押人配偶(如适' (last match over all pages wins). Spans
+        whose vertical centre lies between the label's top and three label
+        heights below its bottom are inspected for '姓名' and '证件号码'; the
+        stored value is the part after the last '：'.
+
+        Returns:
+            tuple: ``(name, _id)``, each a copy of ``self.item``.
+        """
+        def _text_spans():
+            # Yield (page key, bbox, text) for every span of every type-0 block.
+            for page_key in self.pdf_info:
+                for blk in self.pdf_info[page_key]['blocks']:
+                    if blk['type'] != 0:
+                        continue
+                    for row in blk['lines']:
+                        for piece in row['spans']:
+                            yield page_key, piece['bbox'], piece['text']
+
+        name = self.item.copy()
+        _id = self.item.copy()
+
+        label_box = None
+        for _, box, content in _text_spans():
+            if content == '抵押人配偶(如适':
+                label_box = box
+        if label_box is None:
+            return name, _id
+
+        row_height = abs(label_box[1] - label_box[3])
+        band_top = label_box[1]
+        band_bottom = label_box[3] + row_height * 3
+        for page_key, box, content in _text_spans():
+            in_band = band_top < np.mean(box[1::2]) < band_bottom
+            if in_band and '姓名' in content:
+                name['position'] = box
+                name['page'] = page_key
+                name['words'] = content.split('：')[-1]
+            if in_band and '证件号码' in content:
+                _id['position'] = box
+                _id['page'] = page_key
+                _id['words'] = content.split('：')[-1]
+        return name, _id
+
+    def get_key_value_position(self, key):
+        """Find the value printed to the right of a label, by position.
+
+        The label is the span whose text equals ``key`` (last match over all
+        pages wins). A span on any page is taken as the value when its
+        vertical centre lies strictly inside the label's y-range, its left
+        edge is to the right of the label's left edge, and the gap between
+        the label's right edge and its left edge is under ten label heights.
+        The last such span wins.
+
+        Args:
+            key: exact text of the label span.
+
+        Returns:
+            dict: a copy of ``self.item`` with ``position``, ``page`` and
+            ``words`` filled in when a value was found.
+        """
+        def _text_spans():
+            # Yield (page key, bbox, text) for every span of every type-0 block.
+            for page_key in self.pdf_info:
+                for blk in self.pdf_info[page_key]['blocks']:
+                    if blk['type'] != 0:
+                        continue
+                    for row in blk['lines']:
+                        for piece in row['spans']:
+                            yield page_key, piece['bbox'], piece['text']
+
+        value = self.item.copy()
+
+        label_box = None
+        for _, box, content in _text_spans():
+            if content == key:
+                label_box = box
+        if label_box is None:
+            return value
+
+        row_height = abs(label_box[1] - label_box[3])
+        for page_key, box, content in _text_spans():
+            same_row = label_box[1] < np.mean(box[1::2]) < label_box[3]
+            right_of_label = label_box[0] < box[0]
+            close_enough = abs(label_box[2] - box[0]) < row_height * 10
+            if same_row and right_of_label and close_enough:
+                value['position'] = box
+                value['page'] = page_key
+                value['words'] = content
+        return value
+
+    def get_role_info_3_3(self, role_key, page_num='0'):
+        """Read one role's name, ID number and representative from a page.
+
+        The top-left corner of the last span starting with '保证人2' is the
+        anchor that splits the page into four quadrants. The name comes from
+        the last span matching ``role_key`` (used as a regex at the start of
+        the text). The ID number ('证件号码：') and the representative
+        ('法定代表人或授权代表：') are only accepted from the quadrant that
+        belongs to the role:
+
+            '承租人一：'   left of / above the anchor
+            '共同承租人：' left of / below the anchor
+            '保证人1：'    right of / above the anchor
+            '保证人2：'    right of / below the anchor
+
+        For any other ``role_key`` only the name is looked up. Without an
+        anchor nothing is filled in.
+
+        Args:
+            role_key: role label, e.g. '承租人一：'.
+            page_num: key of the page in ``self.pdf_info``.
+
+        Returns:
+            tuple: ``(name, id_num, representative)``, copies of ``self.item``.
+        """
+        name = self.item.copy()
+        id_num = self.item.copy()
+        representative = self.item.copy()
+
+        def _text_spans():
+            # Yield (bbox, text) for every span of every type-0 block.
+            for blk in self.pdf_info[page_num]['blocks']:
+                if blk['type'] != 0:
+                    continue
+                for row in blk['lines']:
+                    for piece in row['spans']:
+                        yield piece['bbox'], piece['text']
+
+        # Anchor: top-left corner of the '保证人2' span
+        anchor = None
+        for box, content in _text_spans():
+            if re.match('保证人2', content) is not None:
+                anchor = [box[0], box[1]]
+        if anchor is None:
+            return name, id_num, representative
+
+        # role label -> (left of anchor?, above anchor?)
+        quadrants = {
+            '承租人一：': (True, True),
+            '共同承租人：': (True, False),
+            '保证人1：': (False, True),
+            '保证人2：': (False, False),
+        }
+        quadrant = quadrants.get(role_key)
+
+        def _in_quadrant(box):
+            # Strict comparisons of the box centre against the anchor.
+            on_left, on_top = quadrant
+            center_x = np.mean(box[::2])
+            center_y = np.mean(box[1::2])
+            x_ok = center_x < anchor[0] if on_left else center_x > anchor[0]
+            y_ok = center_y < anchor[1] if on_top else center_y > anchor[1]
+            return x_ok and y_ok
+
+        def _fill(slot, box, content):
+            slot['words'] = content.split('：')[-1]
+            slot['page'] = page_num
+            slot['position'] = box
+
+        for box, content in _text_spans():
+            # Role name
+            if re.match(role_key, content) is not None:
+                _fill(name, box, content)
+            if quadrant is None:
+                continue
+            # ID number inside the role's quadrant
+            if re.match('证件号码：', content) is not None and _in_quadrant(box):
+                _fill(id_num, box, content)
+            # Legal / authorised representative inside the role's quadrant
+            if re.match('法定代表人或授权代表：', content) is not None and _in_quadrant(box):
+                _fill(representative, box, content)
+        return name, id_num, representative
+
+    def get_value_by_findall(self, prefix, suffix, page_num):
+        """Find the text enclosed by ``prefix`` and ``suffix`` on one page.
+
+        All span texts of the page's type-0 blocks are concatenated and
+        searched with the non-greedy regex ``prefix(.*?)suffix``. The first
+        match is the value; its position is the bbox of the last span whose
+        text contains that value.
+
+        Args:
+            prefix: text before the value. It is interpolated into the regex
+                as-is, so regex metacharacters are NOT escaped.
+            suffix: text after the value, interpolated the same way.
+            page_num: key of the page in ``self.pdf_info``.
+
+        Returns:
+            dict: a copy of ``self.item`` with ``position``, ``page`` and
+            ``words`` filled in when the value was found inside a single
+            span. The copy is returned unchanged when nothing matches or
+            when the page does not exist (e.g. a document with fewer pages
+            than the caller assumed).
+        """
+        value = self.item.copy()
+        pno = page_num
+        # Callers pass hard-coded page keys; a missing page means "not found"
+        # rather than an error.
+        if pno not in self.pdf_info:
+            return value
+
+        spans = [
+            span
+            for block in self.pdf_info[pno]['blocks'] if block['type'] == 0
+            for line in block['lines']
+            for span in line['spans']
+        ]
+        all_text = ''.join(span['text'] for span in spans)
+        words_list = re.findall(f"{prefix}(.*?){suffix}", all_text)
+        if words_list:
+            for span in spans:
+                if words_list[0] in span['text']:
+                    value['position'] = span['bbox']
+                    value['page'] = pno
+                    value['words'] = words_list[0]
+        return value
+
+    def get_info(self):
+        """Fill ``self.init_result`` with the fields of the main contract.
+
+        Blocks whose ``block['type']`` is not 0 are skipped; type-0 blocks
+        are the ones whose lines/spans text is read here (presumably the
+        text blocks of the PDF extraction - TODO confirm against the
+        extractor that builds ``pdf_info``).
+
+        Nothing is extracted when ``self.pdf_info`` is empty.
+
+        Returns:
+            dict: ``self.init_result``.
+        """
+        if len(self.pdf_info) > 0:
+            # Contract number from the first page (page key '0')
+            contract_no = self.get_contract_no(page_num='0')
+            self.init_result['合同编号'] = contract_no
+
+            # Rough check for the "车贷分离" contract version: the first page
+            # mentions '共同承租人：'
+            is_cdfl = False
+            for block in self.pdf_info['0']['blocks']:
+                if block['type'] != 0:
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text']
+                        if '共同承租人：' in text:
+                            is_cdfl = True
+
+            if is_cdfl == False:
+                # Names and ID numbers of the four roles from the first page
+                name, id_num, representative = self.get_role_info(role_key='承租人：', page_num='0')
+
+                # Lessee not found under '承租人：': retry with the 3_3 layout label
+                if name["words"] == None:
+                    name, id_num, representative = self.get_role_info_3_3(role_key='承租人一：', page_num='0')
+                self.init_result['承租人-姓名'] = name
+                self.init_result['承租人-证件号码'] = id_num
+                self.init_result['承租人-法定代表人或授权代表'] = representative
+
+                name, id_num, representative = self.get_role_info(role_key='保证人1：', page_num='0')
+                self.init_result['保证人1-姓名'] = name
+                self.init_result['保证人1-证件号码'] = id_num
+                self.init_result['保证人1-法定代表人或授权代表'] = representative  
+                # Fallback for the 3_3 layout: look up the co-lessee instead
+                if name["words"] == None:
+                    name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人：', page_num='0')
+                    self.init_result['共同承租人-姓名'] = name
+                    self.init_result['共同承租人-证件号码'] = id_num
+                    self.init_result['共同承租人-法定代表人或授权代表'] = representative
+
+                name, id_num, representative = self.get_role_info(role_key='保证人2：', page_num='0')
+                self.init_result['保证人2-姓名'] = name
+                self.init_result['保证人2-证件号码'] = id_num
+                self.init_result['保证人2-法定代表人或授权代表'] = representative 
+                # Fallback for the 3_3 layout
+                # NOTE(review): the '保证人1：' lookup is stored under the
+                # '保证人2-*' keys here - confirm this shift is intended.
+                if name["words"] == None:
+                    name, id_num, representative = self.get_role_info_3_3(role_key='保证人1：', page_num='0')
+                    self.init_result['保证人2-姓名'] = name
+                    self.init_result['保证人2-证件号码'] = id_num
+                    self.init_result['保证人2-法定代表人或授权代表'] = representative
+
+                name, id_num, representative = self.get_role_info(role_key='保证人3：', page_num='0')
+                self.init_result['保证人3-姓名'] = name
+                self.init_result['保证人3-证件号码'] = id_num
+                self.init_result['保证人3-法定代表人或授权代表'] = representative
+                # NOTE(review): the '保证人2：' lookup is stored under the
+                # '保证人3-*' keys here - confirm this shift is intended.
+                if name["words"] == None:
+                    name, id_num, representative = self.get_role_info_3_3(role_key='保证人2：', page_num='0')
+                    self.init_result['保证人3-姓名'] = name
+                    self.init_result['保证人3-证件号码'] = id_num
+                    self.init_result['保证人3-法定代表人或授权代表'] = representative
+            else:
+                # "车贷分离" version: all four roles use the 3_3 layout lookup
+                name, id_num, representative = self.get_role_info_3_3(role_key='承租人一：', page_num='0')
+                self.init_result['承租人-姓名'] = name
+                self.init_result['承租人-证件号码'] = id_num
+                self.init_result['承租人-法定代表人或授权代表'] = representative
+
+                name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人：', page_num='0')
+                self.init_result['共同承租人-姓名'] = name
+                self.init_result['共同承租人-证件号码'] = id_num
+                self.init_result['共同承租人-法定代表人或授权代表'] = representative
+
+                name, id_num, representative = self.get_role_info_3_3(role_key='保证人1：', page_num='0')
+                self.init_result['保证人1-姓名'] = name
+                self.init_result['保证人1-证件号码'] = id_num
+                self.init_result['保证人1-法定代表人或授权代表'] = representative
+
+                name, id_num, representative = self.get_role_info_3_3(role_key='保证人2：', page_num='0')
+                self.init_result['保证人2-姓名'] = name
+                self.init_result['保证人2-证件号码'] = id_num
+                self.init_result['保证人2-法定代表人或授权代表'] = representative
+
+            # Contract number in the body text (part two, "融资租赁主要条款及付款计划"),
+            # searched on all pages; it may wrap across lines, so for now it is
+            # output without a position
+            contract_no = self.get_contract_no_one()
+            self.init_result['合同编号（正文）'] = contract_no
+            # Vehicle identification number
+            vin = self.get_key_value(key='车辆识别代码：')
+            self.init_result['车辆识别代码'] = vin
+            # Vehicle seller (dealer); fall back to the shorter label
+            seller = self.get_key_value(key='车辆卖方（经销商）：')
+            if seller['words'] == None:
+                seller = self.get_key_value(key='车辆卖方：')
+            self.init_result['车辆卖方（经销商）'] = seller
+            # Original vehicle sales price
+            vehicle_price = self.get_key_value(key='车辆原始销售价格（《机动车销售统一发票》所列金额）：')
+            self.init_result['车辆原始销售价格（《机动车销售统一发票》所列金额）'] = vehicle_price
+            # Vehicle add-on product details (table)
+            table_add_product = self.get_table_add_product()
+            self.init_result['车辆附加产品明细表'] = table_add_product
+            # Total financing cost
+            financing_cost = self.get_key_value(key='融资成本总额：')
+            self.init_result['融资成本总额'] = financing_cost
+            # Lease term
+            lease_term = self.get_key_value(key='租期：')
+            self.init_result['租期'] = lease_term
+            # Repayment schedule (table)
+            repayment_schedule = self.get_repayment_schedule()
+            self.init_result['付款计划表'] = repayment_schedule
+            # Lessee's receiving account: holder name, account number, bank
+            # (read from page key '4')
+            name = self.get_key_value(key='户名：', page_num='4')
+            self.init_result['承租人收款账户-户名'] = name
+            account = self.get_key_value(key='银行账号：', page_num='4')
+            self.init_result['承租人收款账户-银行账号'] = account
+            bank = self.get_key_value(key='开户银行：', page_num='4')
+            self.init_result['承租人收款账户-开户行'] = bank
+            # Lessee's debit account: holder name, account number, bank
+            # (read from page key '5')
+            name = self.get_key_value(key='户名：', page_num='5')
+            self.init_result['承租人扣款账户-户名'] = name
+            account = self.get_key_value(key='银行账号：', page_num='5')
+            self.init_result['承租人扣款账户-银行账号'] = account
+            bank = self.get_key_value(key='开户银行：', page_num='5')
+            self.init_result['承租人扣款账户-开户行'] = bank
+
+            # Information on the signature page
+            # Lessee name and signature
+            if is_cdfl == False:
+                name = self.get_key_value(key='承租人姓名：')
+                electronic_signature = self.get_electronic_signature(top='承租人姓名：', bottom='保证人1姓名：')
+
+                if name["words"] == None:
+                    name = self.get_key_value(key='承租人一姓名：')
+                    electronic_signature = self.get_electronic_signature(top='承租人一姓名：', bottom='共同承租人名称：')
+
+                self.init_result['签字页-承租人姓名'] = name
+                self.init_result['签字页-承租人签章'] = electronic_signature
+                # Guarantor 1 name and signature
+                name = self.get_key_value(key='保证人1姓名：')
+                electronic_signature = self.get_electronic_signature(top='保证人1姓名：', bottom='保证人2姓名：')
+                self.init_result['签字页-保证人1姓名'] = name
+                self.init_result['签字页-保证人1签章'] = electronic_signature
+                # This check compares against "" (empty string), not None
+                # NOTE(review): other fallbacks in this method test for None -
+                # confirm which value get_key_value yields when nothing is found.
+                if name["words"] == "":
+                    name = self.get_key_value(key='共同承租人名称：')
+                    electronic_signature = self.get_electronic_signature(top='共同承租人名称：', bottom='保证人1姓名：')
+                    self.init_result['签字页-共同承租人姓名'] = name
+                    self.init_result['签字页-共同承租人签章'] = electronic_signature
+                # Guarantor 2 name and signature
+                name = self.get_key_value(key='保证人2姓名：')
+                electronic_signature = self.get_electronic_signature(top='保证人2姓名：', bottom='保证人3姓名：')
+                self.init_result['签字页-保证人2姓名'] = name
+                self.init_result['签字页-保证人2签章'] = electronic_signature
+                # Fallback for the 3_3 layout
+                if name["words"] == "":
+                    name = self.get_key_value(key='保证人1姓名：')
+                    electronic_signature = self.get_electronic_signature(top='保证人1姓名：', bottom='保证人2姓名：')
+                    self.init_result['签字页-保证人1姓名'] = name
+                    self.init_result['签字页-保证人1签章'] = electronic_signature
+                # Guarantor 3 name and signature
+                name = self.get_key_value(key='保证人3姓名：')
+                electronic_signature = self.get_electronic_signature(top='保证人3姓名：', bottom='日期：')
+                self.init_result['签字页-保证人3姓名'] = name
+                self.init_result['签字页-保证人3签章'] = electronic_signature
+                # Fallback for the 3_3 layout
+                if name["words"] == None:
+                    name = self.get_key_value(key='保证人2姓名：')
+                    electronic_signature = self.get_electronic_signature(top='保证人2姓名：', bottom='日期：')
+                    self.init_result['签字页-保证人2姓名'] = name
+                    self.init_result['签字页-保证人2签章'] = electronic_signature
+            else:
+                # "车贷分离" version: lessee, co-lessee, guarantor 1, guarantor 2
+                name = self.get_key_value(key='承租人一姓名：')
+                electronic_signature = self.get_electronic_signature(top='承租人一姓名：', bottom='共同承租人名称：')
+                self.init_result['签字页-承租人姓名'] = name
+                self.init_result['签字页-承租人签章'] = electronic_signature
+
+                name = self.get_key_value(key='共同承租人名称：')
+                electronic_signature = self.get_electronic_signature(top='共同承租人名称：', bottom='保证人1姓名：')
+                self.init_result['签字页-共同承租人姓名'] = name
+                self.init_result['签字页-共同承租人签章'] = electronic_signature
+
+                name = self.get_key_value(key='保证人1姓名：')
+                electronic_signature = self.get_electronic_signature(top='保证人1姓名：', bottom='保证人2姓名：')
+                self.init_result['签字页-保证人1姓名'] = name
+                self.init_result['签字页-保证人1签章'] = electronic_signature
+
+                name = self.get_key_value(key='保证人2姓名：')
+                electronic_signature = self.get_electronic_signature(top='保证人2姓名：', bottom='保证人3姓名：')
+                self.init_result['签字页-保证人2姓名'] = name
+                self.init_result['签字页-保证人2签章'] = electronic_signature
+
+        return self.init_result
+    
+    def get_info_1(self):
+        """Fill ``self.init_result_1`` with the fields of the second document.
+
+        Reads the contract number, lessee name / ID number and dealer from
+        page '0', the contract number from the body text, and the lessee name,
+        ID number, signature and dealer from the signature page. When the
+        dealer lookup yields an empty string, the dealer is taken from the
+        text between two fixed markers instead.
+
+        Returns:
+            dict: ``self.init_result_1`` (untouched when ``self.pdf_info`` is
+            empty).
+        """
+        if len(self.pdf_info) == 0:
+            return self.init_result_1
+
+        # Front page: contract number, lessee name, lessee ID number
+        self.init_result_1['合同编号'] = self.get_contract_no(page_num='0')
+        self.init_result_1['承租人-姓名'] = self.get_key_value(key='承租人：', page_num='0')
+        self.init_result_1['承租人-证件号码'] = self.get_key_value(key='证件号码：', page_num='0')
+
+        # Front page: dealer, with a findall fallback up to '地址：'
+        front_dealer = self.get_key_value(key='销售经销商：', page_num='0')
+        if front_dealer['words'] == "":
+            front_dealer = self.get_value_by_findall('销售经销商：', '地址：', page_num='0')
+        self.init_result_1['销售经销商'] = front_dealer
+
+        # Contract number in the body text
+        self.init_result_1['合同编号（正文）'] = self.get_contract_no_one()
+
+        # Signature page: lessee name, ID number and signature
+        self.init_result_1['签字页-承租人姓名'] = self.get_key_value(key='姓名/名称：')
+        self.init_result_1['签字页-承租人证件号码'] = self.get_key_value(key='自然人身份证件号码/法人执照号码：')
+        self.init_result_1['签字页-承租人签章'] = self.get_signature_role_1()
+
+        # Signature page: dealer. The span may read e.g.
+        # '销售经销商：深圳市宝创汽车贸易有限公司南山分公司（请授权代表签字并请盖章）',
+        # hence the findall fallback on page '3'.
+        signing_dealer = self.get_key_value(key='销售经销商：')
+        if signing_dealer['words'] == "":
+            signing_dealer = self.get_value_by_findall('销售经销商：', '（请授权代表签字并请盖章）', page_num='3')
+        self.init_result_1['签字页-销售经销商'] = signing_dealer
+
+        # The dealer's signature is not extracted.
+        return self.init_result_1
+
+    def get_info_2(self):
+        """Fill ``self.init_result_2`` with the fields of the mortgage contract.
+
+        Collects the mortgage contract number, the contract number from the
+        body text, the mortgagor and spouse name / ID number, the VIN, the
+        total rent, the lease term, and the names and signatures on the
+        signature page.
+
+        Returns:
+            dict: ``self.init_result_2`` (untouched when ``self.pdf_info`` is
+            empty).
+        """
+        if len(self.pdf_info) == 0:
+            return self.init_result_2
+
+        # Contract number (header) and contract number (body text)
+        self.init_result_2['合同编号'] = self.get_contract_no_dy()
+        self.init_result_2['合同编号（正文）'] = self.get_contract_no_one()
+
+        # Mortgagor name and ID number
+        (self.init_result_2['抵押人姓名/名称'],
+         self.init_result_2['抵押人证件号码']) = self.get_dyr_name_id()
+        # Mortgagor's spouse name and ID number
+        (self.init_result_2['抵押人配偶姓名/名称'],
+         self.init_result_2['抵押人配偶证件号码']) = self.get_dyrpo_name_id()
+
+        # VIN, total rent, lease term
+        self.init_result_2['车辆识别代码'] = self.get_key_value(key='车辆识别代码：')
+        self.init_result_2['租金总额'] = self.get_key_value_position(key='租金总额')
+        self.init_result_2['融资租赁期限'] = self.get_key_value_position(key='融资租赁期限')
+
+        # Signature page: mortgagor name and signature
+        signer = self.get_key_value(key='抵押人姓名：')
+        seal = self.get_electronic_signature(top='抵押权人盖章', bottom='抵押人配偶姓名：')
+        self.init_result_2['签字页-抵押人姓名'] = signer
+        self.init_result_2['签字页-抵押人签章'] = seal
+
+        # Signature page: spouse name and signature
+        signer = self.get_key_value(key='抵押人配偶姓名：')
+        seal = self.get_electronic_signature(top='抵押人配偶姓名：', bottom='日期')
+        self.init_result_2['签字页-抵押人配偶姓名'] = signer
+        self.init_result_2['签字页-抵押人配偶签章'] = seal
+        return self.init_result_2
\ No newline at end of file
--- a/src/common/electronic_hil_contract/hil_contract_ocr.py
View file @e0d31a2
+++ b/src/common/electronic_hil_contract/hil_contract_ocr.py
View file @e0d31a2
@@ -6,9 +6,10 @@
 # @Description   :

 from .get_char import Finder
+from .get_char_fsm import Finder as FSMFinder


-def predict(pdf_info, file_cls):
+def predict(pdf_info, file_cls, is_fsm=False):
    """Summary

    Args:
@@ -58,7 +59,11 @@ def predict(pdf_info, file_cls):
        pdf_info = dict()
        for pno, page_info in enumerate(pdf_info_1):
            pdf_info[str(pno)] = page_info
-    f = Finder(pdf_info)
+
+    if is_fsm:
+        f = FSMFinder(pdf_info) 
+    else:
+        f = Finder(pdf_info)
    if file_cls == 0:
        results = f.get_info()
    if file_cls == 1:
--- a/src/common/fsm_econtract/const.py 0 → 100644
View file @e0d31a2
+++ b/src/common/fsm_econtract/const.py 0 → 100644
View file @e0d31a2
+# Field-extraction config passed to Retriever (see fsm_contract_ocr.retriever_list).
+# Presumably describes the WEP (extended warranty) contract layout -- confirm against consts.py.
+#
+# Layout, as read by Retriever.get_target_fields:
+#   page-number string -> {'keys': ..., 'value': ...}
+#   'keys'  : field -> list of (key_text, regex tuple, key direction, kwargs); each regex is
+#             tried with re.match against every text span of the page, and the direction
+#             ('top1') selects the Retriever.key_<direction> method.
+#   'value' : field -> (source, direction, kwargs, default). source 'text' searches the page's
+#             text spans, any other source searches the page's image blocks; direction selects
+#             Retriever.value_<direction>; default is reported when nothing is found.
+#   offset_tuple is (xmin, xmax, ymin, ymax): multiples of the key bbox width (x) / height (y)
+#   by which Retriever.get_target_bbox grows the search window around the key.
+WEP_FIELD = {
+    "0": {
+        'keys': {
+            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
+            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
+            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
+            '合同价格（小写）': [('人民币', (r'^人民币￥.?$', ), 'top1', {})],
+            '客户签名': [('客户签名／盖章', (r'^客户签名／盖章.*$', ), 'top1', {})],
+            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
+        },
+        'value': {
+            '客户姓名': ('text', 'right', {'offset_tuple': (-1.1, 1, 0.3, 0)}, ''),
+            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
+            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
+            '合同价格（小写）': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
+            # Signature and signing date are detected as image blocks; '无' is the default when no image is found.
+            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
+            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
+        },
+    }
+    
+}
+
+# Field-extraction config passed to Retriever (see fsm_contract_ocr.retriever_list).
+# Keys are page-number strings; each page holds 'keys' (how to locate a label: key text,
+# regex tuple, key direction, kwargs) and 'value' (where to read the value relative to the
+# label: source, direction, kwargs, default). Here page "0" carries the customer/price text
+# fields and page "1" carries the signature and signing-date image checks.
+MSI_FIELD = {
+    "0": {
+        'keys': {
+            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
+            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
+            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
+            '合同价格（小写）': [('人民币', (r'^人民币￥.?$', ), 'top1', {})],
+        },
+        'value': {
+            '客户姓名': ('text', 'right', {'offset_tuple': (-1.2, 1, 0.3, 0)}, ''),
+            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
+            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
+            '合同价格（小写）': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
+        },
+    },
+    "1": {
+        'keys': {
+            '客户签名': [('客户签名／盖章', (r'^客户签名／盖章.*$', ), 'top1', {})],
+            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
+        },
+        'value': {
+            # Image-sourced fields: '无' is the default reported when no image block is found.
+            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
+            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
+        },
+    }
+}
+
+# Field-extraction config passed to Retriever (see fsm_contract_ocr.retriever_list).
+# Keys are page-number strings; each page holds 'keys' (how to locate a label: key text,
+# regex tuple, key direction, kwargs) and 'value' (where to read the value relative to the
+# label: source, direction, kwargs, default).
+# The page key "-1" is resolved by Retriever.get_target_fields to the highest page number
+# present in the extracted text, i.e. the last page that has text.
+SC_FIELD = {
+    "0": {
+        'keys': {
+            '姓名': [('姓名', (r'^姓名.?$', r'^企业名称.?$'), 'top1', {})],
+            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
+            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
+            '总价': [('总价', (r'^总价.?$', ), 'top1', {})],
+        },
+        'value': {
+            '姓名': ('text', 'right', {'offset_tuple': (-2, 8, 0.5, 0)}, ''),
+            '证件类型': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
+            '证件号码': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
+            '总价': ('text', 'right', {'offset_tuple': (-2, 12, 0.5, 0)}, ''),
+        },
+    },
+    "-1": {
+        'keys': {
+            # Both the ASCII '/' and the full-width '／' spellings of the label are accepted.
+            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', r'^客户签名／盖章.*$'), 'top1', {})],
+            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
+        },
+        'value': {
+            # Image-sourced fields: '无' is the default reported when no image block is found.
+            '客户签名': ('img', 'under', {'offset_tuple': (1.5, 1, 0, 4), 'rigorous': True}, '无'),
+            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
+        },
+    }
+}
--- a/src/common/fsm_econtract/fsm_contract_ocr.py 0 → 100644
View file @e0d31a2
+++ b/src/common/fsm_econtract/fsm_contract_ocr.py 0 → 100644
View file @e0d31a2
+from .retriever import Retriever
+from .const import WEP_FIELD, MSI_FIELD, SC_FIELD
+from .tools import pdf_info_rebuild
+
+# One Retriever per contract layout; predict() indexes this list with file_type
+# (0 -> WEP_FIELD, 1 -> MSI_FIELD, 2 -> SC_FIELD).
+retriever_list = [Retriever(field_config) for field_config in (WEP_FIELD, MSI_FIELD, SC_FIELD)]
+
+
+def predict(pdf_info, file_type=0):
+    """Extract the configured fields of one contract PDF.
+
+    Args:
+        pdf_info: per-page PDF structure, keyed by page-number string, in the
+            shape pdf_info_rebuild expects.
+        file_type: index into retriever_list selecting the contract layout.
+
+    Returns:
+        Whatever Retriever.get_target_fields returns for the selected layout.
+    """
+    # Select the retriever first so a bad file_type fails before pdf_info is touched.
+    selected = retriever_list[file_type]
+    text_by_page, img_by_page = pdf_info_rebuild(pdf_info)
+    return selected.get_target_fields(text_by_page, img_by_page)
+
+
--- a/src/common/fsm_econtract/hmh_ocr.py 0 → 100644
View file @e0d31a2
+++ b/src/common/fsm_econtract/hmh_ocr.py 0 → 100644
View file @e0d31a2
+from .retriever import HMHRetriever
+from .tools import pdf_info_rebuild
+
+# Single HMHRetriever instance reused by every predict() call.
+hmh_retriever = HMHRetriever()
+
+
+def predict(pdf_info):
+    """Extract the HMH fields from a PDF structure.
+
+    Only the text spans are used; bbox height correction is switched off and
+    the image blocks returned by pdf_info_rebuild are ignored.
+    """
+    rebuilt = pdf_info_rebuild(pdf_info, fix_bbox=False)
+    text_by_page = rebuilt[0]
+    return hmh_retriever.get_target_fields(text_by_page)
+
+
--- a/src/common/fsm_econtract/retriever.py 0 → 100644
View file @e0d31a2
+++ b/src/common/fsm_econtract/retriever.py 0 → 100644
View file @e0d31a2
+import re
+
+
+class HMHRetriever:
+    """Regex-based extractor that reads five fields from the first page's text.
+
+    The fields are listed in ``search_fields_list`` as ``(field name, default
+    words)`` pairs; every field is always present in the output, falling back
+    to its default words and ``default_position`` when no span matches.
+    """
+
+    # Compiled once; the patterns themselves are unchanged.
+    _NAME_ID_COMPANY_RE = re.compile(r'姓名(.*)证件号码(.*)与(.*公司)')
+    _APPLICATION_NO_RE = re.compile(r'合同编号.*(CH-B\d*-\d*).*')
+    _NAME_DATE_RE = re.compile(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})')
+
+    def __init__(self):
+        self.words_str = 'words'
+        self.position_str = 'location'
+        self.default_position = [0, 0, 0, 0]
+        self.search_fields_list = [
+            ('借款/承租人姓名', ''),
+            ('证件号码', ''),
+            ('渠道', ''),
+            ('合同编号', ''),
+            ('借款人签字/盖章', '无'),
+        ]
+
+    def get_target_fields(self, pdf_text_list):
+        """Scan the spans of page ``'0'`` and return the extracted fields.
+
+        Args:
+            pdf_text_list: mapping of page-number string to a list of
+                ``(bbox, text)`` pairs. It is only read, never modified.
+
+        Returns:
+            ``{"words_result": {field: {'words': str, 'location': bbox}}}``.
+            For each field the first matching span wins.
+        """
+        name_key, id_key, channel_key, contract_key, sign_key = (
+            field for field, _ in self.search_fields_list
+        )
+        result = dict()
+
+        # .get() rather than .pop(): page 0 must stay in the caller's mapping.
+        for bbox, text in pdf_text_list.get(str(0), []):
+            if name_key not in result:
+                matched = self._NAME_ID_COMPANY_RE.search(text)
+                if matched is not None:
+                    name, id_no, company = matched.groups()
+                    result[name_key] = {
+                        self.words_str: name.replace('\u3000', '').strip(),
+                        self.position_str: bbox,
+                    }
+                    result[id_key] = {
+                        # Drop a trailing ASCII or full-width closing parenthesis.
+                        self.words_str: id_no.replace('\u3000', '').replace(')', '').replace('）', '').strip(),
+                        self.position_str: bbox,
+                    }
+                    result[channel_key] = {
+                        self.words_str: company,
+                        self.position_str: bbox,
+                    }
+
+            if contract_key not in result:
+                application_no_list = self._APPLICATION_NO_RE.findall(text)
+                # Only an unambiguous (single) match in the span is accepted.
+                if len(application_no_list) == 1:
+                    result[contract_key] = {
+                        self.words_str: application_no_list[0],
+                        self.position_str: bbox,
+                    }
+
+            if sign_key not in result:
+                matched = self._NAME_DATE_RE.search(text)
+                if matched is not None:
+                    signer, sign_date = matched.groups()
+                    result[sign_key] = {
+                        self.words_str: '{0} {1}'.format(signer.replace('\u3000', '').strip(), sign_date),
+                        self.position_str: bbox,
+                    }
+
+        # Fill in defaults for everything that was not found.
+        for find_key, default_value in self.search_fields_list:
+            if find_key not in result:
+                result[find_key] = {
+                    self.words_str: default_value,
+                    # Copy so results never alias the shared default list.
+                    self.position_str: list(self.default_position),
+                }
+
+        return {"words_result": result}
+
+class Retriever:
+    """Locate configured key labels on PDF pages and read the value next to each.
+
+    ``target_fields`` maps a page-number string (``'-1'`` means the last page
+    that has text) to ``{'keys': ..., 'value': ...}``:
+
+    * ``keys``: field -> list of ``(key_text, regex_tuple, direction, kwargs)``;
+      ``direction`` selects the ``key_<direction>`` method.
+    * ``value``: field -> ``(source, direction, kwargs, default_value)``;
+      ``source`` ``'text'`` searches text spans, anything else searches image
+      blocks, and ``direction`` selects the ``value_<direction>`` method.
+    """
+
+    def __init__(self, target_fields):
+        self.keys_str = 'keys'
+        self.value_str = 'value'
+        self.text_str = 'text'
+        self.words_str = 'words'
+        self.position_str = 'position'
+        self.default_position = [-1, -1, -1, -1]
+        self.target_fields = target_fields
+        # Per value_type character substitutions applied to extracted text.
+        self.replace_map = {
+            'int': {
+                '(': '0'
+            }
+        }
+
+    @staticmethod
+    def _in_window(bbox, window, rigorous):
+        """Return True when bbox lies in window (fully if rigorous, else by its centre)."""
+        x0, y0, x1, y1 = bbox
+        x_min, y_min, x_max, y_max = window
+        if rigorous:
+            return x_min < x0 and x1 < x_max and y_min < y0 and y1 < y_max
+        cent_x = x0 + ((x1 - x0) / 2)
+        cent_y = y0 + ((y1 - y0) / 2)
+        return x_min < cent_x < x_max and y_min < cent_y < y_max
+
+    def _apply_replace_map(self, value, value_type):
+        """Apply the replace_map entry for value_type to value, if both are usable."""
+        if isinstance(value_type, str) and value_type in self.replace_map and isinstance(value, str):
+            return value.translate(str.maketrans(self.replace_map.get(value_type, {})))
+        return value
+
+    def _default_entry(self, default_value):
+        """Build the result entry reported when a field cannot be located."""
+        return {
+            self.words_str: default_value,
+            # Copy so results never alias the shared default list.
+            self.position_str: list(self.default_position),
+        }
+
+    @staticmethod
+    def key_top1(coordinates_list, key_coordinates):
+        """Key direction 'top1': return the candidate with the smallest y0.
+
+        ``key_coordinates`` is unused; it is accepted so every ``key_*`` method
+        shares one call shape. The input list is left untouched.
+        """
+        return min(coordinates_list, key=lambda box: box[1])
+
+    def key_right(self, coordinates_list, key_coordinates, offset_tuple, rigorous=False):
+        """Key direction 'right': leftmost candidate inside the window of the previous key.
+
+        Falls back to ``key_top1`` when there is no previous key or no
+        candidate lies in the window; a single candidate is returned as is.
+        """
+        if len(coordinates_list) == 1:
+            return coordinates_list[0]
+
+        # Without the coordinates of the previous key, take the topmost candidate.
+        if key_coordinates is None:
+            return self.key_top1(coordinates_list, key_coordinates)
+
+        window = self.get_target_bbox(key_coordinates, offset_tuple)
+        eligible = [
+            (x0, y0, x1, y1)
+            for x0, y0, x1, y1 in coordinates_list
+            if self._in_window((x0, y0, x1, y1), window, rigorous)
+        ]
+        if not eligible:
+            return self.key_top1(coordinates_list, key_coordinates)
+        # min() keeps the first of equally-left candidates.
+        return min(eligible, key=lambda box: box[0])
+
+    def value_right(self, search_list, key_coordinates, offset_tuple, value_type=None, rigorous=False):
+        """Value direction 'right': leftmost non-blank item inside the key's window.
+
+        Returns:
+            ``(value, (x0, y0, x1, y1))``, or ``(None, None)`` when nothing qualifies.
+        """
+        window = self.get_target_bbox(key_coordinates, offset_tuple)
+
+        value, coordinates = None, None
+        for (x0, y0, x1, y1), text in search_list:
+            if not self._in_window((x0, y0, x1, y1), window, rigorous):
+                continue
+            if len(text.strip()) == 0:
+                continue
+            if coordinates is None or x0 < coordinates[0]:
+                value = text
+                coordinates = (x0, y0, x1, y1)
+
+        return self._apply_replace_map(value, value_type), coordinates
+
+    def value_under(self, search_list, key_coordinates, offset_tuple, value_type=None, append=False, rigorous=False):
+        """Value direction 'under': first non-blank item (top-to-bottom, then left-to-right) in the window.
+
+        With ``append=True`` the texts of all items in the window are joined in
+        that order; the returned coordinates are always those of the first item.
+
+        Returns:
+            ``(value, (x0, y0, x1, y1))``, or ``(None, None)`` when nothing qualifies.
+        """
+        window = self.get_target_bbox(key_coordinates, offset_tuple)
+
+        find_list = [
+            (x0, y0, x1, y1, text)
+            for (x0, y0, x1, y1), text in search_list
+            if self._in_window((x0, y0, x1, y1), window, rigorous) and len(text.strip()) > 0
+        ]
+        if not find_list:
+            return None, None
+
+        find_list.sort(key=lambda item: (item[1], item[0]))
+        coordinates = find_list[0][:-1]
+        if append:
+            value = ''.join([text for _, _, _, _, text in find_list])
+        else:
+            value = find_list[0][-1]
+
+        return self._apply_replace_map(value, value_type), coordinates
+
+    @staticmethod
+    def get_target_bbox(key_coordinates, offset_tuple):
+        """Grow the key bbox into a search window.
+
+        ``offset_tuple`` is ``(xmin, xmax, ymin, ymax)``; the x offsets are
+        multiples of the key width, the y offsets multiples of the key height.
+        The min offsets are subtracted, so a negative one moves that edge
+        right/down (e.g. xmin=-1 starts the window at the key's right edge).
+        """
+        offset_xmin, offset_xmax, offset_ymin, offset_ymax = offset_tuple
+
+        width = key_coordinates[2] - key_coordinates[0]
+        height = key_coordinates[-1] - key_coordinates[1]
+
+        x_min = key_coordinates[0] - (width * offset_xmin)
+        x_max = key_coordinates[2] + (width * offset_xmax)
+        y_min = key_coordinates[1] - (height * offset_ymin)
+        y_max = key_coordinates[-1] + (height * offset_ymax)
+        return x_min, y_min, x_max, y_max
+
+    def get_target_fields(self, pdf_text_list, pdf_img_list):
+        """Extract every configured field from the rebuilt PDF data.
+
+        Args:
+            pdf_text_list: page-number string -> list of ``(bbox, text)`` spans.
+            pdf_img_list: page-number string -> list of ``(bbox, text)`` image markers.
+
+        Returns:
+            ``{'page_<n>': {field: {'words': str, 'position': [x0, y0, x1, y1]}}}``
+            with 1-based ``n``. Fields that cannot be located carry their
+            configured default and ``default_position``.
+        """
+        pdf_result = dict()
+
+        for pno_str, fields_dict in self.target_fields.items():
+            if pno_str == '-1':
+                # Last page that has text; fall back to page 0 when no page has
+                # text so defaults are reported instead of max() raising.
+                pno_str = str(max((int(page_no) for page_no in pdf_text_list), default=0))
+
+            page_text = pdf_text_list.get(pno_str, [])
+            page_img = pdf_img_list.get(pno_str, [])
+
+            # Step 1: collect the bbox of every span matching a key regex.
+            key_text_info = dict()
+            for key_text_list in fields_dict[self.keys_str].values():
+                for key_text, key_re_tuple, _, _ in key_text_list:
+                    for (x0, y0, x1, y1), text in page_text:
+                        for key_re in key_re_tuple:
+                            if re.match(key_re, text):
+                                key_text_info.setdefault(key_text, list()).append((x0, y0, x1, y1))
+
+            # Step 2: resolve each field's key chain to one bbox; a missing
+            # link resets the chain.
+            key_coordinates_info = dict()
+            for field, key_text_list in fields_dict[self.keys_str].items():
+                last_key_coordinates = None
+                for key_text, _, direction, kwargs in key_text_list:
+                    if key_text not in key_text_info:
+                        last_key_coordinates = None
+                        continue
+                    last_key_coordinates = getattr(self, 'key_{0}'.format(direction))(
+                        key_text_info[key_text],
+                        last_key_coordinates,
+                        **kwargs)
+
+                key_coordinates_info[field] = last_key_coordinates
+
+            # Step 3: read the value relative to each located key.
+            page_result = dict()
+            for field, (source, direction, kwargs, default_value) in fields_dict[self.value_str].items():
+                if not isinstance(key_coordinates_info.get(field), tuple):
+                    page_result[field] = self._default_entry(default_value)
+                    continue
+                value, coordinates = getattr(self, 'value_{0}'.format(direction))(
+                    page_text if source == self.text_str else page_img,
+                    key_coordinates_info[field],
+                    **kwargs
+                )
+                if isinstance(value, str):
+                    page_result[field] = {
+                        self.words_str: value,
+                        self.position_str: list(coordinates),
+                    }
+                else:
+                    page_result[field] = self._default_entry(default_value)
+
+            # Merge rather than assign: two configured pages (e.g. "0" and
+            # "-1") can resolve to the same real page on short documents.
+            pdf_result.setdefault('page_{0}'.format(int(pno_str) + 1), dict()).update(page_result)
+
+        return pdf_result
--- a/src/common/fsm_econtract/tools.py 0 → 100644
View file @e0d31a2
+++ b/src/common/fsm_econtract/tools.py 0 → 100644
View file @e0d31a2
+def pdf_info_rebuild(pdf_info, fix_bbox=True):
+    """Flatten a per-page PDF structure into text-span and image-block lists.
+
+    Args:
+        pdf_info: page-number string -> page dict with a ``'blocks'`` list.
+            Blocks of ``type`` 0 carry ``lines`` -> ``spans`` (each span has
+            ``bbox``, ``text`` and ``size``); blocks of ``type`` 1 are images
+            with a ``bbox``. The input is never modified.
+        fix_bbox: when True, a span whose bbox is shorter than its font size
+            gets its bottom edge extended by the font size.
+
+    Returns:
+        ``(pdf_text_info, pdf_img_info)``: page-number string -> list of
+        ``[bbox, text]`` for spans, and page-number string -> list of
+        ``(bbox, '有')`` for image blocks. Pages without entries are absent.
+    """
+    pdf_text_info = dict()
+    pdf_img_info = dict()
+    for pno_str, page_info in pdf_info.items():
+        for block in page_info['blocks']:
+            if block['type'] == 0:
+                # The same text can be repeated inside one block; keep the first occurrence only.
+                seen_text = set()
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text'].strip()
+                        if len(text) == 0 or text in seen_text:
+                            continue
+                        seen_text.add(text)
+                        # The reported bbox height is unreliable: extend boxes shorter
+                        # than the font size. Build a new list instead of assigning into
+                        # span['bbox'], which would alter the caller's data and fail on
+                        # tuple bboxes.
+                        if fix_bbox and bbox[-1] - bbox[1] < span['size']:
+                            bbox = [*bbox[:-1], bbox[-1] + span['size']]
+                        pdf_text_info.setdefault(pno_str, list()).append([bbox, text])
+            elif block['type'] == 1:
+                pdf_img_info.setdefault(pno_str, list()).append((block['bbox'], '有'))
+
+    return pdf_text_info, pdf_img_info
\ No newline at end of file