Merge branch 'feature/fsm-contract' into fix/report_ca

周伟奇
Showing 7 changed files with 482 additions and 17 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/ocr_process.py
src/common/fsm_econtract/const.py
src/common/fsm_econtract/fsm_contract_ocr.py
src/common/fsm_econtract/hmh_ocr.py
src/common/fsm_econtract/retriever.py
src/common/fsm_econtract/tools.py
--- a/src/apps/doc/consts.py
View file @d784780
+++ b/src/apps/doc/consts.py
View file @d784780
@@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44
 HIL_CONTRACT_3_CN_NAME = '车辆处置协议'
 HIL_CONTRACT_3_CLASSIFY = 45

-CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY}
+FSM_CONTRACT_WEP_CN_NAME = '延长保修合同'
+FSM_CONTRACT_WEP_CLASSIFY = 51
+
+FSM_CONTRACT_MSI_CN_NAME = '长悦保养合同'
+FSM_CONTRACT_MSI_CLASSIFY = 52
+
+FSM_CONTRACT_SC_CN_NAME = '汽车销售合同'
+FSM_CONTRACT_SC_CLASSIFY = 53
+
+CONTRACT_SET = {
+    CONTRACT_QRS_CLASSIFY, 
+    CONTRACT_CLASSIFY, 
+    HIL_CONTRACT_1_CLASSIFY, 
+    HIL_CONTRACT_2_CLASSIFY, 
+    HIL_CONTRACT_3_CLASSIFY,
+    FSM_CONTRACT_WEP_CLASSIFY,
+    FSM_CONTRACT_MSI_CLASSIFY,
+    FSM_CONTRACT_SC_CLASSIFY, 
+}

 CONTRACT_MAP = {
    HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME,
@@ -1065,6 +1083,10 @@ CONTRACT_MAP = {
    HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME,
    CONTRACT_CLASSIFY: CONTRACT_CN_NAME,
    CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME,
+    FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_CN_NAME,
+    FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_CN_NAME,
+    FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_CN_NAME, 
+
 }

 # 保单
@@ -1214,6 +1236,11 @@ BS_FIELD = 'bss_ocr'
 HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr'
 HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr'
 HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr'
+FSM_CONTRACT_WEP_FIELD = 'fsm_wep_ocr'  # plain str like the other *_FIELD names; a trailing comma would make a tuple
+FSM_CONTRACT_MSI_FIELD = 'fsm_msi_ocr'
+FSM_CONTRACT_SC_FIELD = 'fsm_sc_ocr'
+
+
 BS_CLASSIFY = 10089

 RESULT_MAPPING = {
@@ -1238,6 +1265,9 @@ RESULT_MAPPING = {
    HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD,
    HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD,
    HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD,
+    FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_FIELD,
+    FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_FIELD,
+    FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_FIELD, 
 }

 CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD)
@@ -2313,29 +2343,35 @@ APPLICANT_TYPE_MAP = {

 APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager']

-FILE_NAME_PREFIX_MAP = {
-    AFC_PREFIX: [
-        ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'),
-        ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
-    ],
-    HIL_PREFIX: [
-        ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'),
-        ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'),
-        ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
-    ]
-}
+# FILE_NAME_PREFIX_MAP = {
+#     AFC_PREFIX: [
+#         ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'),
+#         ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
+#     ],
+#     HIL_PREFIX: [
+#         ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'),
+#         ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'),
+#         ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
+#     ]
+# }

 ECONTRACT_KEYWORDS_MAP = {
    AFC_PREFIX: [
        ('抵押贷款合同', CONTRACT_CLASSIFY),
        ('送达地址确认书', CONTRACT_QRS_CLASSIFY),
-        # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0),
+        ('抵押登记豁免函', HMH_CLASSIFY),
+        ('延长保修', FSM_CONTRACT_WEP_CLASSIFY),
+        ('长悦保养', FSM_CONTRACT_MSI_CLASSIFY),
+        ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
    ],
    HIL_PREFIX: [
        ('售后回租合同', HIL_CONTRACT_1_CLASSIFY),
        ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY),
        ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY),
-        # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0),
+        ('抵押登记豁免函', HMH_CLASSIFY),
+        ('延长保修', FSM_CONTRACT_WEP_CLASSIFY),
+        ('长悦保养', FSM_CONTRACT_MSI_CLASSIFY),
+        ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
    ]
 }

@@ -2345,6 +2381,12 @@ HIL_CONTRACT_TYPE_MAP = {
    str(HIL_CONTRACT_3_CLASSIFY): 1,
 }

+FSM_CONTRACT_TYPE_MAP = {
+    str(FSM_CONTRACT_WEP_CLASSIFY): 0,
+    str(FSM_CONTRACT_MSI_CLASSIFY): 1,
+    str(FSM_CONTRACT_SC_CLASSIFY): 2,
+}
+
 RESULT_MAP = {
    0: None,
    1: True,
--- a/src/apps/doc/management/commands/ocr_process.py
View file @d784780
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @d784780
@@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g
 from common.tools.pdf_to_img import PDFHandler
 from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict
 from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict
+from common.fsm_econtract.fsm_contract_ocr import predict as fsm_predict
+from common.fsm_econtract.hmh_ocr import predict as hmh_predict
 from apps.doc import consts
 # from apps.doc.ocr.edms import EDMS, rh
 from apps.doc.ocr.ecm import ECM, rh
@@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin):
                    res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get(
                        consts.ALL_POSITION_KEY, {}).get(key1, [])
                license_summary[classify] = [res]
-            else:
+            elif classify in consts.SE_HIL_CON_MAP:  # TODO FSM新合同写入数据库用于比对
                res = {}
                for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items():
                    if pno1 is None:
@@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin):
                                self.log_base, traceback.format_exc()))
                            error_list.append(1)
                            return
-                else:  # e-contract
+                else:  # e-contract or e-fsm-contract or e-hmh
                    try:
                        # pdf下载 处理 图片存储 识别
                        for times in range(consts.RETRY_TIMES):
@@ -1472,6 +1474,7 @@ class Command(BaseCommand, LoggerMixin):
                            self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
                                self.log_base, traceback.format_exc()))

+                        # AFC合同
                        if classify_1_str == str(consts.CONTRACT_CLASSIFY):
                            ocr_result = afc_predict(pdf_handler.pdf_info)
                            page_res = {}
@@ -1483,6 +1486,7 @@ class Command(BaseCommand, LoggerMixin):
                                        'page_num': page_num,
                                        'page_info': page_info
                                    }
+                        # 送达地址确认书
                        elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY):
                            ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True)
                            page_num = 'page_1'
@@ -1493,7 +1497,8 @@ class Command(BaseCommand, LoggerMixin):
                                    'page_info': ocr_result.pop(page_num, {}) 
                                }
                            }
-                        else:
+                        # HIL合同
+                        elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP:
                            file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str)
                            ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1)
                            rebuild_res_1 = {}
@@ -1508,9 +1513,35 @@ class Command(BaseCommand, LoggerMixin):
                                        'page_num': page_num,
                                        'page_info': page_info
                                    }
+                        # FSM合同 WEP MSI SC
+                        elif classify_1_str in consts.FSM_CONTRACT_TYPE_MAP:
+                            file_type = consts.FSM_CONTRACT_TYPE_MAP.get(classify_1_str)
+                            ocr_result = fsm_predict(pdf_handler.pdf_info, file_type) 
+                            for page_num, page_info in ocr_result.items():
+                                if isinstance(page_num, str) and page_num.startswith('page_'):
+                                    page_res[page_num] = {
+                                        'classify': int(classify_1_str),
+                                        'page_num': page_num,
+                                        'page_info': page_info
+                                    }
+                        # hmh
+                        else:
+                            pass
+

                        contract_res = {}
                        for img_path_tmp, page_key in pdf_handler.img_path_pno_list:
+                            if classify_1_str == str(consts.HMH_CLASSIFY):
+                                img_contract_res = {
+                                        'code': 1,
+                                        'data': [
+                                            {
+                                                'classify': consts.HMH_CLASSIFY,
+                                                'data': hmh_predict(pdf_handler.pdf_info)
+                                            }
+                                        ]
+                                    }
+                            else:
                                if page_key in page_res:
                                    img_contract_res = {
                                        'code': 1,
--- a/src/common/fsm_econtract/const.py 0 → 100644
View file @d784780
+++ b/src/common/fsm_econtract/const.py 0 → 100644
View file @d784780
+# Field-extraction config for the WEP contract, consumed by Retriever.
+# Top level: 0-based page number (as a string) -> config for that page.
+#   'keys':  field -> [(key_text, regex_tuple, direction, kwargs), ...]. Every
+#            regex is tried with re.match against each text span of the page;
+#            `direction` selects the Retriever.key_<direction> method.
+#   'value': field -> (source, direction, kwargs, default). Source 'text'
+#            searches the page's text spans, any other source searches its
+#            image blocks; `direction` selects Retriever.value_<direction>;
+#            `default` is reported when the key or the value is not found.
+# 'offset_tuple' is (xmin, xmax, ymin, ymax) in multiples of the key box's
+# width/height and defines the search region relative to the key box.
+WEP_FIELD = {
+    "0": {
+        'keys': {
+            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
+            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
+            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
+            '合同价格（小写）': [('人民币', (r'^人民币￥.?$', ), 'top1', {})],
+            '客户签名': [('客户签名／盖章', (r'^客户签名／盖章.*$', ), 'top1', {})],
+            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
+        },
+        'value': {
+            '客户姓名': ('text', 'right', {'offset_tuple': (-1.1, 1, 0.3, 0)}, ''),
+            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
+            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
+            '合同价格（小写）': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
+            # 'img' fields only detect presence: an image block inside the
+            # region yields that block's text, otherwise the default '无'.
+            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
+            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
+        },
+    }
+    
+}
+
+# Field-extraction config for the MSI contract, consumed by Retriever.
+# Keyed by 0-based page number string: page "0" holds the customer/price text
+# fields, page "1" the signature and signing-date image fields.
+# Each page maps 'keys' (field -> [(key_text, regex_tuple, direction, kwargs)])
+# and 'value' (field -> (source, direction, kwargs, default)); 'offset_tuple'
+# is (xmin, xmax, ymin, ymax) in multiples of the key box's width/height.
+MSI_FIELD = {
+    "0": {
+        'keys': {
+            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
+            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
+            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
+            '合同价格（小写）': [('人民币', (r'^人民币￥.?$', ), 'top1', {})],
+        },
+        'value': {
+            '客户姓名': ('text', 'right', {'offset_tuple': (-1.2, 1, 0.3, 0)}, ''),
+            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
+            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
+            '合同价格（小写）': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
+        },
+    },
+    "1": {
+        'keys': {
+            '客户签名': [('客户签名／盖章', (r'^客户签名／盖章.*$', ), 'top1', {})],
+            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
+        },
+        'value': {
+            # Looked up among the page's image blocks; '无' when none is found.
+            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
+            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
+        },
+    }
+}
+
+# Field-extraction config for the SC contract, consumed by Retriever.
+# Keyed by 0-based page number string; the special key "-1" is resolved by
+# Retriever.get_target_fields to the highest page number present in the text.
+# Each page maps 'keys' (field -> [(key_text, regex_tuple, direction, kwargs)])
+# and 'value' (field -> (source, direction, kwargs, default)); 'offset_tuple'
+# is (xmin, xmax, ymin, ymax) in multiples of the key box's width/height.
+SC_FIELD = {
+    "0": {
+        'keys': {
+            '姓名': [('姓名', (r'^姓名.?$', r'^企业名称.?$'), 'top1', {})],
+            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
+            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
+            '总价': [('总价', (r'^总价.?$', ), 'top1', {})],
+        },
+        'value': {
+            '姓名': ('text', 'right', {'offset_tuple': (-2, 8, 0.5, 0)}, ''),
+            '证件类型': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
+            '证件号码': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
+            '总价': ('text', 'right', {'offset_tuple': (-2, 12, 0.5, 0)}, ''),
+        },
+    },
+    "-1": {
+        'keys': {
+            # Both the ASCII '/' and the full-width '／' spellings are accepted.
+            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', r'^客户签名／盖章.*$'), 'top1', {})],
+            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
+        },
+        'value': {
+            # Looked up among the page's image blocks; '无' when none is found.
+            '客户签名': ('img', 'under', {'offset_tuple': (1.5, 1, 0, 4), 'rigorous': True}, '无'),
+            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
+        },
+    }
+}
--- a/src/common/fsm_econtract/fsm_contract_ocr.py 0 → 100644
View file @d784780
+++ b/src/common/fsm_econtract/fsm_contract_ocr.py 0 → 100644
View file @d784780
+from .retriever import Retriever
+from .const import WEP_FIELD, MSI_FIELD, SC_FIELD
+from .tools import pdf_info_rebuild
+
+# One retriever per FSM contract type, in file_type order: WEP, MSI, SC.
+retriever_list = [Retriever(field_config) for field_config in (WEP_FIELD, MSI_FIELD, SC_FIELD)]
+
+def predict(pdf_info, file_type=0):
+    """Extract the configured fields from the parsed PDF of one FSM contract.
+
+    ``file_type`` is the index into ``retriever_list``; the result is whatever
+    ``Retriever.get_target_fields`` returns for the rebuilt text/image lists.
+    """
+    # Select the retriever first so a bad file_type fails before pdf_info is processed.
+    selected = retriever_list[file_type]
+    text_by_page, images_by_page = pdf_info_rebuild(pdf_info)
+    return selected.get_target_fields(text_by_page, images_by_page)
+
+
--- a/src/common/fsm_econtract/hmh_ocr.py 0 → 100644
View file @d784780
+++ b/src/common/fsm_econtract/hmh_ocr.py 0 → 100644
View file @d784780
+from .retriever import HMHRetriever
+from .tools import pdf_info_rebuild
+
+# Shared retriever instance for HMH documents.
+hmh_retriever = HMHRetriever()
+
+def predict(pdf_info):
+    """Extract the HMH fields from the parsed PDF.
+
+    Only the text spans are used, and their bounding boxes are left as
+    reported (``fix_bbox=False``).
+    """
+    text_by_page, _unused_images = pdf_info_rebuild(pdf_info, fix_bbox=False)
+    return hmh_retriever.get_target_fields(text_by_page)
+
+
--- a/src/common/fsm_econtract/retriever.py 0 → 100644
View file @d784780
+++ b/src/common/fsm_econtract/retriever.py 0 → 100644
View file @d784780
+import re
+
+
+class HMHRetriever:
+    """Regex-based field extractor for the first page of an HMH document.
+
+    Looks for the party line (name, ID number, company), the contract number
+    and the signature/date line among the text spans of page ``'0'``.
+    """
+
+    # Party line patterns, tried in this order for every span: the lessee
+    # (承租人) wording first, then the borrower (借款人) wording.
+    _PARTY_PATTERNS = (
+        re.compile(r'承租人\(姓名(.*)证件号码(.*)与(.*公司)'),
+        re.compile(r'借款人\(姓名(.*)证件号码(.*)与(.*公司)'),
+    )
+    _CONTRACT_NO_PATTERN = re.compile(r'合同编号.*(CH-B\d*-\d*).*')
+    _SIGN_DATE_PATTERN = re.compile(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})')
+
+    def __init__(self):
+        self.words_str = 'words'
+        self.position_str = 'location'
+        # Location reported for fields that were not found.
+        self.default_position = [0, 0, 0, 0]
+        # (field name, default words) in output order.
+        self.search_fields_list = [
+            ('借款/承租人姓名', ''),
+            ('证件号码', ''),
+            ('渠道', ''),
+            ('合同编号', ''),
+            ('借款人签字/盖章', '无'),
+        ]
+
+    def _entry(self, words, position):
+        """Build one result record from the extracted words and their box."""
+        return {self.words_str: words, self.position_str: position}
+
+    def get_target_fields(self, pdf_text_list):
+        """Extract the HMH fields from the first page's text spans.
+
+        Args:
+            pdf_text_list: mapping of page number string to a list of
+                ``(bbox, text)`` pairs; only page ``'0'`` is read.
+
+        Returns:
+            ``{"words_result": {field: {'words': ..., 'location': ...}}}`` with
+            an entry for every field in ``search_fields_list``; fields that
+            were not found carry their default words and the default location.
+        """
+        name_key, id_key, company_key, contract_key, sign_key = (
+            field for field, _ in self.search_fields_list)
+
+        result = dict()
+        found_party, found_contract_no, found_signature = False, False, False
+        # Read with get(): extracting fields must not remove the page from the
+        # caller's mapping.
+        for bbox, text in pdf_text_list.get(str(0), []):
+            if not found_party:
+                for pattern in self._PARTY_PATTERNS:
+                    matches = pattern.findall(text)
+                    if not matches:
+                        continue
+                    name, id_number, company = matches[0]
+                    # '\u3000' is the full-width space used as padding.
+                    result[name_key] = self._entry(name.replace('\u3000', '').strip(), bbox)
+                    result[id_key] = self._entry(id_number.replace('\u3000', '').strip(), bbox)
+                    result[company_key] = self._entry(company, bbox)
+                    found_party = True
+                    break
+            if not found_contract_no:
+                matches = self._CONTRACT_NO_PATTERN.findall(text)
+                # Accept the span only when it holds exactly one contract number.
+                if len(matches) == 1:
+                    result[contract_key] = self._entry(matches[0], bbox)
+                    found_contract_no = True
+            if not found_signature:
+                matches = self._SIGN_DATE_PATTERN.findall(text)
+                if matches:
+                    signer, sign_date = matches[0]
+                    result[sign_key] = self._entry(
+                        '{0} {1}'.format(signer.replace('\u3000', '').strip(), sign_date), bbox)
+                    found_signature = True
+
+        for find_key, default_value in self.search_fields_list:
+            if find_key not in result:
+                # Copy the default location so records never share one list.
+                result[find_key] = self._entry(default_value, list(self.default_position))
+        return {"words_result": result}
+
+class Retriever:
+    """Locate configured key labels on PDF pages and read the value near each.
+
+    ``target_fields`` maps a 0-based page number string (``'-1'`` stands for
+    the highest page number present in the text) to a dict with:
+
+    * ``'keys'``: field -> ``[(key_text, regex_tuple, direction, kwargs), ...]``;
+      ``direction`` selects the ``key_<direction>`` method.
+    * ``'value'``: field -> ``(source, direction, kwargs, default_value)``;
+      source ``'text'`` searches text spans, anything else image blocks, and
+      ``direction`` selects the ``value_<direction>`` method.
+
+    Boxes are ``(x0, y0, x1, y1)``.
+    """
+
+    def __init__(self, target_fields):
+        self.keys_str = 'keys'
+        self.value_str = 'value'
+        self.text_str = 'text'
+        self.words_str = 'words'
+        self.position_str = 'position'
+        # Position reported for fields that could not be located.
+        self.default_position = [-1, -1, -1, -1]
+        self.target_fields = target_fields
+        # Character substitutions applied to a value, per value_type.
+        self.replace_map = {
+            'int': {
+                '(': '0'
+            }
+        }
+
+    @staticmethod
+    def _in_region(box, region, rigorous):
+        """Return whether ``box`` lies inside ``region``.
+
+        With ``rigorous`` the whole box must be strictly inside the region;
+        otherwise only the centre of the box has to be.
+        """
+        x0, y0, x1, y1 = box
+        x_min, y_min, x_max, y_max = region
+        if rigorous:
+            return x_min < x0 and x1 < x_max and y_min < y0 and y1 < y_max
+        cent_x = x0 + ((x1 - x0) / 2)
+        cent_y = y0 + ((y1 - y0) / 2)
+        return x_min < cent_x < x_max and y_min < cent_y < y_max
+
+    def _apply_replace_map(self, value, value_type):
+        """Apply the ``replace_map`` substitutions for ``value_type``, if any."""
+        if isinstance(value_type, str) and value_type in self.replace_map and isinstance(value, str):
+            return value.translate(str.maketrans(self.replace_map[value_type]))
+        return value
+
+    @staticmethod
+    def key_top1(coordinates_list, key_coordinates):
+        """Key lookup direction: return the topmost box (smallest ``y0``).
+
+        ``key_coordinates`` is unused; it is accepted so all ``key_*`` methods
+        can be called the same way. The caller's list is not reordered.
+        """
+        return min(coordinates_list, key=lambda box: box[1])
+
+    def key_right(self, coordinates_list, key_coordinates, offset_tuple, rigorous=False):
+        """Key lookup direction: leftmost candidate in the region right of the previous key.
+
+        Falls back to the topmost candidate when there is no previous key box
+        or no candidate lies inside the region.
+        """
+        if len(coordinates_list) == 1:
+            return coordinates_list[0]
+
+        # Without the previous key's box there is no region to search.
+        if key_coordinates is None:
+            return self.key_top1(coordinates_list, key_coordinates)
+
+        region = self.get_target_bbox(key_coordinates, offset_tuple)
+
+        x_min_find, find_key_coordinates = None, None
+        for x0, y0, x1, y1 in coordinates_list:
+            if not self._in_region((x0, y0, x1, y1), region, rigorous):
+                continue
+            if x_min_find is None or x0 < x_min_find:
+                x_min_find = x0
+                find_key_coordinates = (x0, y0, x1, y1)
+
+        if find_key_coordinates is None:
+            return self.key_top1(coordinates_list, key_coordinates)
+        return find_key_coordinates
+
+    def value_right(self, search_list, key_coordinates, offset_tuple, value_type=None, rigorous=False):
+        """Value lookup direction: leftmost non-blank item in the region right of the key.
+
+        Returns ``(value, coordinates)``, or ``(None, None)`` when nothing is found.
+        """
+        region = self.get_target_bbox(key_coordinates, offset_tuple)
+
+        x_min_find, value, coordinates = None, None, None
+        for (x0, y0, x1, y1), text in search_list:
+            if not self._in_region((x0, y0, x1, y1), region, rigorous):
+                continue
+            if (x_min_find is None or x0 < x_min_find) and len(text.strip()) > 0:
+                x_min_find = x0
+                value = text
+                coordinates = (x0, y0, x1, y1)
+
+        return self._apply_replace_map(value, value_type), coordinates
+
+    def value_under(self, search_list, key_coordinates, offset_tuple, value_type=None, append=False, rigorous=False):
+        """Value lookup direction: non-blank items in the region below the key.
+
+        Items are ordered top-to-bottom, then left-to-right. The first item's
+        box is returned together with either its text or, with ``append``, the
+        concatenation of all texts. Returns ``(None, None)`` when nothing is found.
+        """
+        region = self.get_target_bbox(key_coordinates, offset_tuple)
+
+        find_list = [
+            (x0, y0, x1, y1, text)
+            for (x0, y0, x1, y1), text in search_list
+            if self._in_region((x0, y0, x1, y1), region, rigorous) and len(text.strip()) > 0
+        ]
+        if len(find_list) == 0:
+            return None, None
+
+        find_list.sort(key=lambda x: (x[1], x[0]))
+        coordinates = find_list[0][:-1]
+        if append:
+            value = ''.join([text for _, _, _, _, text in find_list])
+        else:
+            value = find_list[0][-1]
+
+        return self._apply_replace_map(value, value_type), coordinates
+
+    @staticmethod
+    def get_target_bbox(key_coordinates, offset_tuple):
+        """Return the search region ``(x_min, y_min, x_max, y_max)`` around a key box.
+
+        ``offset_tuple`` is ``(xmin, xmax, ymin, ymax)`` in multiples of the key
+        box's width/height. The min offsets are subtracted from the left/top
+        edge, so a negative value moves that edge right/down.
+        """
+        offset_xmin, offset_xmax, offset_ymin, offset_ymax = offset_tuple
+
+        width = key_coordinates[2] - key_coordinates[0]
+        height = key_coordinates[-1] - key_coordinates[1]
+
+        x_min = key_coordinates[0] - (width * offset_xmin)
+        x_max = key_coordinates[2] + (width * offset_xmax)
+        y_min = key_coordinates[1] - (height * offset_ymin)
+        y_max = key_coordinates[-1] + (height * offset_ymax)
+        return x_min, y_min, x_max, y_max
+
+    def get_target_fields(self, pdf_text_list, pdf_img_list):
+        """Extract every configured field from the rebuilt PDF content.
+
+        Args:
+            pdf_text_list: page number string -> list of ``(bbox, text)`` spans.
+            pdf_img_list: page number string -> list of ``(bbox, text)`` image items.
+
+        Returns:
+            ``{'page_<n>': {field: {'words': ..., 'position': ...}}}`` with
+            1-based ``n``. Fields that cannot be located carry their default
+            value and the default position.
+        """
+        pdf_result = dict()
+
+        for pno_str, fields_dict in self.target_fields.items():
+            if pno_str == '-1':
+                # '-1' addresses the last page that has text. With no text at
+                # all fall back to page 0, so the fields get their defaults
+                # instead of max() raising on an empty sequence.
+                pno_str = str(max((int(pno) for pno in pdf_text_list), default=0))
+            page_text = pdf_text_list.get(pno_str, [])
+
+            # Step 1: collect the boxes of every span matching a key regex.
+            key_text_info = dict()
+            for key_text_list in fields_dict[self.keys_str].values():
+                for key_text, key_re_tuple, _, _ in key_text_list:
+                    for (x0, y0, x1, y1), text in page_text:
+                        for key_re in key_re_tuple:
+                            if re.match(key_re, text):
+                                key_text_info.setdefault(key_text, list()).append((x0, y0, x1, y1))
+
+            # Step 2: walk each field's key chain to settle on one key box.
+            key_coordinates_info = dict()
+            for field, key_text_list in fields_dict[self.keys_str].items():
+                last_key_coordinates = None
+                for key_text, _, direction, kwargs in key_text_list:
+                    if key_text not in key_text_info:
+                        last_key_coordinates = None
+                        continue
+                    last_key_coordinates = getattr(self, 'key_{0}'.format(direction))(
+                        key_text_info[key_text],
+                        last_key_coordinates,
+                        **kwargs)
+
+                key_coordinates_info[field] = last_key_coordinates
+
+            # Step 3: read the value next to each key box.
+            page_result = dict()
+            for field, (source, direction, kwargs, default_value) in fields_dict[self.value_str].items():
+                value, coordinates = None, None
+                if isinstance(key_coordinates_info.get(field), tuple):
+                    search_list = page_text if source == self.text_str else pdf_img_list.get(pno_str, [])
+                    value, coordinates = getattr(self, 'value_{0}'.format(direction))(
+                        search_list,
+                        key_coordinates_info[field],
+                        **kwargs
+                    )
+                if isinstance(value, str):
+                    page_result[field] = {
+                        self.words_str: value,
+                        self.position_str: list(coordinates),
+                    }
+                else:
+                    page_result[field] = {
+                        self.words_str: default_value,
+                        # Copy so results never share the default list.
+                        self.position_str: list(self.default_position),
+                    }
+
+            # Merge rather than assign: '-1' may resolve to a page another
+            # entry already filled (e.g. a single-page PDF), and its fields
+            # must not be discarded.
+            page_key = 'page_{0}'.format(int(pno_str) + 1)
+            pdf_result.setdefault(page_key, dict()).update(page_result)
+
+        return pdf_result
--- a/src/common/fsm_econtract/tools.py 0 → 100644
View file @d784780
+++ b/src/common/fsm_econtract/tools.py 0 → 100644
View file @d784780
+def pdf_info_rebuild(pdf_info, fix_bbox=True):
+    """Split parsed per-page PDF info into text spans and image blocks.
+
+    Args:
+        pdf_info: mapping of page key -> page dict with a 'blocks' list. Blocks
+            of 'type' 0 carry 'lines' -> 'spans' (each with 'bbox', 'text' and
+            'size'); blocks of 'type' 1 carry a 'bbox'.
+        fix_bbox: when true, a span whose box is lower than its font size gets
+            its bottom edge pushed down by the font size.
+
+    Returns:
+        ``(pdf_text_info, pdf_img_info)``: page key -> ``[[bbox, text], ...]``
+        and page key -> ``[(bbox, '有'), ...]``. Pages without entries are
+        absent; empty input yields two empty dicts.
+    """
+    pdf_text_info = dict()
+    pdf_img_info = dict()
+    for pno_str, page_info in pdf_info.items():
+        for block in page_info['blocks']:
+            if block['type'] == 0:
+                # The same text can be repeated within one block; keep the first.
+                seen_text = set()
+                for line in block['lines']:
+                    for span in line['spans']:
+                        bbox, text = span['bbox'], span['text'].strip()
+                        if len(text) == 0 or text in seen_text:
+                            continue
+                        seen_text.add(text)
+                        # The reported box height is not reliable.
+                        if fix_bbox and bbox[-1] - bbox[1] < span['size']:
+                            # Adjust a copy: the caller's data stays untouched
+                            # and tuple boxes are supported as well.
+                            bbox = list(bbox)
+                            bbox[-1] = bbox[-1] + span['size']
+                        pdf_text_info.setdefault(pno_str, list()).append([bbox, text])
+            elif block['type'] == 1:
+                pdf_img_info.setdefault(pno_str, list()).append((block['bbox'], '有'))
+
+    # Return only after all pages were processed (not from inside the loop).
+    return pdf_text_info, pdf_img_info
\ No newline at end of file