e0d31a26 by 周伟奇

Merge branch 'fix/report_ca' into feature/uat-tmp

2 parents 4398d1df e2de024d
......@@ -11,7 +11,7 @@ PAGE_SIZE_DEFAULT = 10
FIXED_APPLICATION_ID_PREFIX = 'CH-S'
DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT']
DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT']
DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT', 'OVP']
COMPARE_DOC_SCHEME_LIST = ['CA', 'SE']
HIL_PREFIX = 'HIL'
......@@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44
HIL_CONTRACT_3_CN_NAME = '车辆处置协议'
HIL_CONTRACT_3_CLASSIFY = 45
CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY}
# FSM contract types: Chinese display name and classify code for each.
# WEP: extended warranty contract
FSM_CONTRACT_WEP_CN_NAME = '延长保修合同'
FSM_CONTRACT_WEP_CLASSIFY = 51
# MSI: maintenance contract
FSM_CONTRACT_MSI_CN_NAME = '长悦保养合同'
FSM_CONTRACT_MSI_CLASSIFY = 52
# SC: car sales contract
FSM_CONTRACT_SC_CN_NAME = '汽车销售合同'
FSM_CONTRACT_SC_CLASSIFY = 53
# Classify codes grouped as contracts; now also holds the three FSM codes.
CONTRACT_SET = {
    CONTRACT_QRS_CLASSIFY,
    CONTRACT_CLASSIFY,
    HIL_CONTRACT_1_CLASSIFY,
    HIL_CONTRACT_2_CLASSIFY,
    HIL_CONTRACT_3_CLASSIFY,
    FSM_CONTRACT_WEP_CLASSIFY,
    FSM_CONTRACT_MSI_CLASSIFY,
    FSM_CONTRACT_SC_CLASSIFY,
}
CONTRACT_MAP = {
HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME,
......@@ -1065,8 +1083,13 @@ CONTRACT_MAP = {
HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME,
CONTRACT_CLASSIFY: CONTRACT_CN_NAME,
CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME,
FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_CN_NAME,
FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_CN_NAME,
FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_CN_NAME,
}
FSM_CONTRACT_CLASSIFY_SET = {FSM_CONTRACT_WEP_CLASSIFY, FSM_CONTRACT_MSI_CLASSIFY, FSM_CONTRACT_SC_CLASSIFY}
# 保单
INSURANCE_CN_NAME = '保单'
INSURANCE_CLASSIFY = 42
......@@ -1214,6 +1237,11 @@ BS_FIELD = 'bss_ocr'
HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr'
HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr'
HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr'
FSM_CONTRACT_WEP_FIELD = 'fsm_wep_ocr'
FSM_CONTRACT_MSI_FIELD = 'fsm_msi_ocr'
FSM_CONTRACT_SC_FIELD = 'fsm_sc_ocr'
BS_CLASSIFY = 10089
RESULT_MAPPING = {
......@@ -1238,6 +1266,9 @@ RESULT_MAPPING = {
HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD,
HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD,
HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD,
FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_FIELD,
FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_FIELD,
FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_FIELD,
}
CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD)
......@@ -2313,29 +2344,42 @@ APPLICANT_TYPE_MAP = {
APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager']
FILE_NAME_PREFIX_MAP = {
AFC_PREFIX: [
((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'),
((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
],
HIL_PREFIX: [
((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'),
((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'),
((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
]
}
# FILE_NAME_PREFIX_MAP = {
# AFC_PREFIX: [
# ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'),
# ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
# ],
# HIL_PREFIX: [
# ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'),
# ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'),
# ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
# ]
# }
ECONTRACT_KEYWORDS_MAP = {
AFC_PREFIX: [
('抵押贷款合同', CONTRACT_CLASSIFY),
('送达地址确认书', CONTRACT_QRS_CLASSIFY),
# ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0),
('抵押登记豁免函', HMH_CLASSIFY),
],
HIL_PREFIX: [
('售后回租合同', HIL_CONTRACT_1_CLASSIFY),
('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY),
('车辆处置协议', HIL_CONTRACT_3_CLASSIFY),
# ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0),
('抵押登记豁免函', HMH_CLASSIFY),
]
}
# Per business prefix: (keyword, classify code) pairs for FSM contracts.
# The upload view assigns the classify code of the first pair whose keyword
# occurs in the uploaded document name.
# NOTE(review): the MSI keyword ends in '合约' for AFC but '合同' for HIL --
# confirm both spellings are intended and not a typo in one of them.
FSM_ECONTRACT_KEYWORDS_MAP = {
    AFC_PREFIX: [
        ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY),
        ('长悦保养套餐服务合约', FSM_CONTRACT_MSI_CLASSIFY),
        ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
    ],
    HIL_PREFIX: [
        ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY),
        ('长悦保养套餐服务合同', FSM_CONTRACT_MSI_CLASSIFY),
        ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
    ]
}
......@@ -2345,6 +2389,12 @@ HIL_CONTRACT_TYPE_MAP = {
str(HIL_CONTRACT_3_CLASSIFY): 1,
}
# FSM classify code (as a string) -> file-type index; the OCR command looks the
# index up by classify string and passes it to fsm_predict.
FSM_CONTRACT_TYPE_MAP = {
    str(FSM_CONTRACT_WEP_CLASSIFY): 0,
    str(FSM_CONTRACT_MSI_CLASSIFY): 1,
    str(FSM_CONTRACT_SC_CLASSIFY): 2,
}
RESULT_MAP = {
0: None,
1: True,
......
......@@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g
from common.tools.pdf_to_img import PDFHandler
from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict
from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict
from common.fsm_econtract.fsm_contract_ocr import predict as fsm_predict
from common.fsm_econtract.hmh_ocr import predict as hmh_predict
from apps.doc import consts
# from apps.doc.ocr.edms import EDMS, rh
from apps.doc.ocr.ecm import ECM, rh
......@@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin):
res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get(
consts.ALL_POSITION_KEY, {}).get(key1, [])
license_summary[classify] = [res]
else:
elif classify in consts.SE_HIL_CON_MAP: # TODO FSM新合同写入数据库用于比对
res = {}
for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items():
if pno1 is None:
......@@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin):
self.log_base, traceback.format_exc()))
error_list.append(1)
return
else: # e-contract
else: # e-contract or or e-fsm-contract or e-hmh
try:
# pdf下载 处理 图片存储 识别
for times in range(consts.RETRY_TIMES):
......@@ -1472,8 +1474,10 @@ class Command(BaseCommand, LoggerMixin):
self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
# AFC合同
if classify_1_str == str(consts.CONTRACT_CLASSIFY):
ocr_result = afc_predict(pdf_handler.pdf_info)
is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3]
ocr_result = afc_predict(pdf_handler.pdf_info, is_fsm=is_fsm)
page_res = {}
for page_num, page_info in ocr_result.get('page_info', {}).items():
if isinstance(page_num, str) and page_num.startswith('page_'):
......@@ -1483,6 +1487,7 @@ class Command(BaseCommand, LoggerMixin):
'page_num': page_num,
'page_info': page_info
}
# 送达地址确认书
elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY):
ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True)
page_num = 'page_1'
......@@ -1493,9 +1498,11 @@ class Command(BaseCommand, LoggerMixin):
'page_info': ocr_result.pop(page_num, {})
}
}
else:
# HIL合同
elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP:
is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3]
file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str)
ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1)
ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1, is_fsm=is_fsm)
rebuild_res_1 = {}
page_res = {}
for field_name, field_info in ocr_result_1.items():
......@@ -1508,28 +1515,55 @@ class Command(BaseCommand, LoggerMixin):
'page_num': page_num,
'page_info': page_info
}
# FSM合同 WEP MSI SC
elif classify_1_str in consts.FSM_CONTRACT_TYPE_MAP:
file_type = consts.FSM_CONTRACT_TYPE_MAP.get(classify_1_str)
ocr_result = fsm_predict(pdf_handler.pdf_info, file_type)
page_res = {}
for page_num, page_info in ocr_result.items():
if isinstance(page_num, str) and page_num.startswith('page_'):
page_res[page_num] = {
'classify': int(classify_1_str),
'page_num': page_num,
'page_info': page_info
}
# hmh
# else:
# pass
contract_res = {}
for img_path_tmp, page_key in pdf_handler.img_path_pno_list:
if page_key in page_res:
if classify_1_str == str(consts.HMH_CLASSIFY):
img_contract_res = {
'code': 1,
'data': [
{
'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY),
'data': page_res[page_key]
}
]
}
'code': 1,
'data': [
{
'classify': consts.HMH_CLASSIFY,
'data': hmh_predict(pdf_handler.pdf_info)
}
]
}
else:
img_contract_res = {
'code': 1,
'data': [
{
'classify': int(classify_1_str),
}
]
}
if page_key in page_res:
img_contract_res = {
'code': 1,
'data': [
{
'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY),
'data': page_res[page_key]
}
]
}
else:
img_contract_res = {
'code': 1,
'data': [
{
'classify': int(classify_1_str),
}
]
}
contract_res[img_path_tmp] = img_contract_res
with lock:
......
......@@ -36,6 +36,7 @@ class RequestTrigger(NamedEnum):
DOCUPLOAD = (3, 'Document Upload')
SUBMITING = (4, 'Submiting')
UPLOADING = (5, 'Uploading')
OVP = (6, 'OVP')
class FailureReason(NamedEnum):
......
......@@ -780,10 +780,12 @@ class BSWorkbook(Workbook):
if field_str is not None:
count_list.append((field_str, count))
def contract_rebuild(self, contract_result_dict):
def contract_rebuild(self, contract_result_dict, is_ca=False):
for classify, contract_result in contract_result_dict.items():
if len(contract_result) == 0:
continue
if is_ca and classify not in consts.FSM_CONTRACT_CLASSIFY_SET:
continue
ws = self.create_sheet(consts.CONTRACT_MAP.get(classify))
for i in range(30):
if str(i) in contract_result:
......@@ -906,6 +908,7 @@ class BSWorkbook(Workbook):
else:
self.bs_rebuild(bs_summary, res_count_tuple, metadata)
self.license_rebuild(license_summary, document_scheme, count_list)
self.contract_rebuild(contract_result, True)
self.move_res_sheet()
self.remove_base_sheet()
return count_list
......
......@@ -602,13 +602,22 @@ class UploadDocView(GenericView, DocHandler):
is_zip = False
classify_1 = 0
# 电子合同
if data_source == consts.DATA_SOURCE_LIST[-1] and document_scheme == consts.DOC_SCHEME_LIST[1]:
for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix):
# 电子合同 Econtract or OVP(FSM)
if data_source == consts.DATA_SOURCE_LIST[2] or data_source == consts.DATA_SOURCE_LIST[3]:
if document_scheme == consts.DOC_SCHEME_LIST[1]:
for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix):
if keyword in document_name:
classify_1 = classify_1_tmp
break
# FSM合同:WEP/MSI/SC
elif data_source == consts.DATA_SOURCE_LIST[0] and document_scheme == consts.DOC_SCHEME_LIST[0]:
for keyword, classify_1_tmp in consts.FSM_ECONTRACT_KEYWORDS_MAP.get(prefix):
if keyword in document_name:
classify_1 = classify_1_tmp
break
elif document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
break
if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
or document_name.endswith('.RAR'):
is_zip = True
......
......@@ -6,6 +6,7 @@
# @Description :
from .get_char import Finder
from .get_char_fsm import Finder as FSMFinder
import numpy as np
......@@ -23,7 +24,7 @@ def extract_info(ocr_results):
return {'page_1': {'合同编号': contract_no}}
def predict(pdf_info, is_qrs=False):
def predict(pdf_info, is_qrs=False, is_fsm=False):
ocr_results = {}
for pno in pdf_info:
ocr_results[pno] = {}
......@@ -50,7 +51,10 @@ def predict(pdf_info, is_qrs=False):
results = extract_info(ocr_results)
else:
# 输入是整个 PDF 中的信息
f = Finder(pdf_info, ocr_results=ocr_results)
if is_fsm:
f = FSMFinder(pdf_info, ocr_results=ocr_results)
else:
f = Finder(pdf_info, ocr_results=ocr_results)
results = f.get_info()
return results
......
import re
import numpy as np
from fuzzywuzzy import fuzz
from shapely.geometry import Polygon
class Finder:
def __init__(self, pdf_info, ocr_results):
self.pdf_info = pdf_info
self.ocr_results = ocr_results
self.is_asp = False
self.item = {"words": None,
"position": None,
}
def gen_init_result(self, is_asp):
    """Build ``self.init_result``, the empty eight-page output skeleton.

    Every leaf refers to the one shared ``self.item`` object; ``is_asp`` is
    accepted but not consulted.

    Args:
        is_asp: unused flag.
    """
    blank = self.item  # one shared object for every leaf

    def principal():
        return {"大写": blank,
                "小写": blank,
                "车辆贷款本金金额": blank,
                "附加产品融资贷款本金总金额": blank}

    def party():
        return {"name": blank, "id": blank}

    def bank_account():
        return {"账号": blank, "户名": blank, "开户行": blank}

    def signed():
        return {"签字": blank, "日期": blank}

    layout = {}
    layout["page_1"] = {
        "合同编号": blank,
        "所购车辆价格": blank,
        "车架号": blank,
        "贷款本金金额": principal(),
        "贷款期限": blank,
        "附加产品融资贷款本金总金额明细": blank,
        "借款人签字及时间": blank,
    }
    layout["page_2"] = {
        "合同编号": blank,
        "借款人及抵押人": party(),
        "共同借款人及共同抵押人": party(),
        "保证人1": party(),
        "保证人2": party(),
        "所购车辆价格": blank,
        "车架号": blank,
        "经销商": blank,
        "贷款本金金额": principal(),
        "贷款期限": blank,
        "标准利率": blank,
        "借款人收款账户": bank_account(),
        "还款账户": bank_account(),
    }
    layout["page_3"] = {"合同编号": blank, "还款计划表": blank}
    layout["page_4"] = {"合同编号": blank, "附加产品融资贷款本金总金额明细": blank}
    # Pages 5-7 only carry the contract number.
    for page_key in ("page_5", "page_6", "page_7"):
        layout[page_key] = {"合同编号": blank}
    layout["page_8"] = {
        "合同编号": blank,
        "主借人签字": signed(),
        "共借人签字": signed(),
        "保证人1签字": signed(),
        "保证人2签字": signed(),
        "见证人签字": signed(),
    }
    self.init_result = layout
def get_top_iou(self, poly, ocr_result):
    """Pick the OCR entry whose box has the highest IoU with ``poly``.

    Args:
        poly: flat or nested coordinate sequence, reshaped to (N, 2) points.
        ocr_result: mapping of key -> ``(bbox, text)``.

    Returns:
        ``[iou, key]`` for the best entry (the later entry wins a tie), or
        the tuple ``(-1, -1)`` when no entry produced a valid polygon pair.
    """
    best = None
    for key, (bbox, _text) in ocr_result.items():
        candidate = Polygon(np.array(bbox).reshape((-1, 2)))
        target = Polygon(np.array(poly).reshape((-1, 2)))
        # Invalid geometry cannot be intersected reliably; skip the entry.
        if not (candidate.is_valid and target.is_valid):
            continue
        overlap = candidate.intersection(target).area
        score = overlap / (candidate.area + target.area - overlap)
        # ">=" keeps the last of several equally good entries.
        if best is None or score >= best[0]:
            best = [score, key]
    if best is None:
        return -1, -1
    return best
def poly_to_rectangle(self, poly):
    """Collapse an 8-value quadrilateral into ``[xmin, ymin, xmax, ymax]``.

    The values are taken from positions 6, 3, 4 and 7 of ``poly``
    respectively. ``poly`` must unpack into exactly eight values.
    """
    _, _, _, top, right, _, left, bottom = poly
    return [left, top, right, bottom]
def get_contract_no(self, page_num):
    """Read the contract number from the OCR output of one page.

    Args:
        page_num (str): key of the page inside ``self.ocr_results``.

    Returns:
        dict: copy of ``self.item``; ``words`` holds the text after the last
        ``:`` of the matching OCR entry and ``position`` its rectangle. When
        several entries match, the last one wins; with no match the copy is
        returned unchanged.
    """
    marker = '合同编号:'
    contract_no = dict(self.item)
    for bbox, text in self.ocr_results[page_num].values():
        if marker not in text:
            continue
        number = text.split(':')[-1]
        rectangle = self.poly_to_rectangle(bbox)
        contract_no['words'], contract_no['position'] = number, rectangle
    return contract_no
def get_vehicle_price(self, page_num='0'):
    """Read the purchased-vehicle price from the OCR output of one page.

    Args:
        page_num (str): key of the page inside ``self.ocr_results``.

    Returns:
        dict: copy of ``self.item``; for the last OCR entry containing the
        price phrase, ``words`` is the text after the final '币' and
        ``position`` the entry's rectangle.
    """
    marker = '所购车辆价格为人民币'
    vehicle_price = dict(self.item)
    for bbox, text in self.ocr_results[page_num].values():
        if marker not in text:
            continue
        amount = text.split('币')[-1]
        rectangle = self.poly_to_rectangle(bbox)
        vehicle_price['words'], vehicle_price['position'] = amount, rectangle
    return vehicle_price
def get_vin(self, page_num='0'):
    """Read the vehicle frame number (VIN) from the OCR output of one page.

    Args:
        page_num (str): key of the page inside ``self.ocr_results``.

    Returns:
        dict: copy of ``self.item``; for the last OCR entry containing the
        VIN label, ``words`` is the text after the final ``:`` and
        ``position`` the entry's rectangle.
    """
    marker = '车架号:'
    vin = dict(self.item)
    for bbox, text in self.ocr_results[page_num].values():
        if marker not in text:
            continue
        frame_no = text.split(':')[-1]
        rectangle = self.poly_to_rectangle(bbox)
        vin['words'], vin['position'] = frame_no, rectangle
    return vin
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal figures from the text layer of one page.

    Args:
        page_num (str): key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of ``self.item``.
        ``upper`` is the amount in Chinese capital numerals, ``lower`` the
        amount after '小写:¥'. ``asp_1`` / ``asp_2`` are the bracketed
        amounts found above / below the '附加产品融资贷款本金总金额' span.
        Fields that are not found keep the template values.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    keyword_blob = ''.join(chinese_keywords)
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()

    # Collect the text spans once; both passes below walk the same spans.
    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]

    anchor_bbox = None
    for bbox, text in spans:
        if fuzz.ratio(keyword_blob, text) > 15:
            # NOTE(review): ``text`` is rebound here, so the two checks
            # below see the stripped value -- confirm this is intended.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        amount_pattern = re.compile(r'人民币:小写:\[(.*)\]')
        anchor_y = np.mean(anchor_bbox[1::2])
        for bbox, text in spans:
            if '人民币:小写:' not in text:
                continue
            found = amount_pattern.search(text)
            if found is None:
                # Label without a bracketed amount: skip instead of raising
                # IndexError on an empty findall() result.
                continue
            span_y = np.mean(bbox[1::2])
            if span_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = found.group(1)
            elif span_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = found.group(1)
    return upper, lower, asp_1, asp_2
def get_loan_term(self, page_num='0'):
    """Extract the loan term in months from the text layer of one page.

    The span texts are concatenated so the phrase can be matched even when
    it is split across spans; the last span containing ``<n>个月`` supplies
    the position.

    Args:
        page_num (str): key of the page inside ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item`` with ``words`` set to the digit string
        and ``position`` to the span bbox, when found.
    """
    loan_term = self.item.copy()
    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]
    page_text = ''.join(text for _bbox, text in spans)
    hit = re.search(r'贷款期限(\d+)个月', page_text)
    if hit:
        months = hit.group(1)
        needle = f'{months}个月'
        for bbox, text in spans:
            if needle in text:
                loan_term['position'] = bbox
                loan_term['words'] = months
    return loan_term
def get_standard_rate(self, page_num='0'):
    """Extract the standard annual interest rate from one page's text layer.

    Args:
        page_num (str): key of the page inside ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item``; for the last span matching the rate
        sentence, ``words`` is the captured rate (without '%') and
        ``position`` the span bbox.
    """
    rate_sentence = r'本合同当期的标准利率为(\S+)%/年'
    standard_rate = self.item.copy()
    text_blocks = (block for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0)
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                hit = re.search(rate_sentence, text)
                if not hit:
                    continue
                standard_rate['position'] = bbox
                standard_rate['words'] = hit.group(1)
    return standard_rate
def mergelist(self, text_list):
    """Glue '所购…' entries to a following entry that continues in Chinese.

    While some element contains '所购' and the next element has at least one
    Chinese character, the last such pair is concatenated into one element.
    The input list is never mutated.

    Args:
        text_list (list[str]): text fragments in reading order.

    Returns:
        list[str]: ``text_list`` itself when nothing had to be merged,
        otherwise a new list with the merged fragments.
    """
    non_chinese = re.compile("[^\u4e00-\u9fa5]")  # everything except CJK ideographs
    merged = text_list
    while True:
        merge_at = -1
        # Stop one short of the end: the last element has no successor, and
        # indexing past it used to raise IndexError.
        for index in range(len(merged) - 1):
            if '所购' in merged[index] and non_chinese.sub('', merged[index + 1]):
                merge_at = index
        if merge_at == -1:
            return merged
        merged = (merged[:merge_at]
                  + [merged[merge_at] + merged[merge_at + 1]]
                  + merged[merge_at + 2:])
def get_asp_details(self, page_num):
    """Rebuild the add-on product financing detail table from OCR boxes.

    Locates the three column headers in the OCR output of ``page_num``, then
    walks down the first column row by row, using ``get_top_iou`` to pick the
    OCR box that best overlaps each expected cell.

    Args:
        page_num (str): key of the page inside ``self.ocr_results``.

    Returns:
        dict: copy of ``self.item``; when the '项目1' header was found,
        ``words`` is the table as a list of rows (title row, header row,
        data rows and, if located, a total row).
    """
    asp_details_table_term = self.item.copy()

    # Title row and header row are fixed; data rows are appended below.
    asp_details_table = [['附加产品融资贷款本金总金额及贷款利率明细'], ['项目1', '用途总金额2', '贷款本金3']]

    # Boxes of the three column headers and of the "total" label.
    bbox_xm = None
    bbox_ytzje = None
    bbox_dkbj = None
    bbox_total = None
    for key in self.ocr_results[page_num]:
        bbox, text = self.ocr_results[page_num][key]
        if text == '项目1':
            bbox_xm = bbox
        if text == '用途总金额2':
            bbox_ytzje = bbox
        if text == '贷款本金3':
            bbox_dkbj = bbox
        if text in ['附加产品融资贷款本', '附加产品融资贷款本金', '附加产品融资贷']:
            bbox_total = bbox

    if bbox_xm:
        # At most ten steps down the first column.
        for i in range(10):
            # Shift the current first-column box down by 1.4x its height to
            # predict where the next row's cell should be.
            rh = abs(bbox_xm[1]-bbox_xm[-1])
            anchor = np.array(bbox_xm).reshape((-1 ,2))
            anchor[:, 1] += int(rh*1.4)
            _iou, _key = self.get_top_iou(poly=anchor, ocr_result=self.ocr_results[page_num])
            if _iou > 0:
                bbox, xm_text = self.ocr_results[page_num][_key]
                bbox_xm = bbox
                # Handle an item description that wraps onto a second line:
                # append it to the previous row's first cell.
                if not '所购' in xm_text:
                    line = asp_details_table[-1]
                    line[0] += xm_text
                    asp_details_table[-1] = line
                    continue
                # print(xm_text)
                # Cell in the second column: header's x range, this row's y range.
                # NOTE(review): assumes bbox_ytzje / bbox_dkbj were found whenever
                # bbox_xm was; they are still None otherwise.
                anchor_1 = [bbox_ytzje[0], bbox[1], bbox_ytzje[2], bbox[3],
                            bbox_ytzje[4], bbox[5], bbox_ytzje[6], bbox[7]]
                # NOTE(review): get_top_iou returns key -1 when nothing matches,
                # and the lookups below do not check for that.
                _iou, _key = self.get_top_iou(poly=anchor_1, ocr_result=self.ocr_results[page_num])
                bbox, ytzje_text = self.ocr_results[page_num][_key]
                # print(ytzje_text)
                # Cell in the third column, built the same way.
                anchor_2 = [bbox_dkbj[0], bbox[1], bbox_dkbj[2], bbox[3],
                            bbox_dkbj[4], bbox[5], bbox_dkbj[6], bbox[7]]
                _iou, _key = self.get_top_iou(poly=anchor_2, ocr_result=self.ocr_results[page_num])
                bbox, dkbj_text = self.ocr_results[page_num][_key]
                # print(dkbj_text)
                # Same text in both cells means OCR merged them into one box;
                # split on the space (exactly two parts are expected).
                if xm_text == ytzje_text:
                    xm_text, ytzje_text = xm_text.split(' ')
                line = [xm_text, ytzje_text, dkbj_text]
                asp_details_table.append(line)
            else:
                # No box below the current row: end of the table.
                break
        if bbox_total:
            # Total row: third column's x range at the total label's y range.
            anchor = [bbox_dkbj[0], bbox_total[1], bbox_dkbj[2], bbox_total[3],
                      bbox_dkbj[4], bbox_total[5], bbox_dkbj[6], bbox_total[7]]
            _iou, _key = self.get_top_iou(poly=anchor, ocr_result=self.ocr_results[page_num])
            bbox, total_text = self.ocr_results[page_num][_key]
            asp_details_table.append(['附加产品融资贷款本金总金额:', '', total_text])
        asp_details_table_term['words'] = asp_details_table
    return asp_details_table_term
def get_signature(self):
    """Find the signing-date span on the first page (``pdf_info['0']``).

    Returns:
        dict: copy of ``self.item``; ``words`` is the full text of the last
        span containing '签署日期' and ``position`` that span's bbox.
    """
    signature = self.item.copy()
    text_blocks = (block for block in self.pdf_info['0']['blocks'] if block['type'] == 0)
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if '签署日期' not in text:
                    continue
                signature['words'] = text
                signature['position'] = bbox
    return signature
def get_somebody(self, top, bottom):
    """Return the name and ID found between two marker spans on page ``'1'``.

    The vertical band is bounded by the bottom edge (``bbox[3]``) of the
    last span containing ``top`` and of the last span containing ``bottom``;
    both default to 0 when the marker is absent.

    Args:
        top (str): text marking the upper boundary.
        bottom (str): text marking the lower boundary.

    Returns:
        tuple: ``(_name, _id)``, copies of ``self.item`` filled from spans
        strictly inside the band.
    """
    _name = self.item.copy()
    _id = self.item.copy()
    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info['1']['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]

    # First pass: establish the band limits.
    y_top = 0
    y_bottom = 0
    for bbox, text in spans:
        if top in text:
            y_top = bbox[3]
        if bottom in text:
            y_bottom = bbox[3]

    # Second pass: harvest the labelled values inside the band.
    for bbox, text in spans:
        if not (y_top < bbox[3] < y_bottom):
            continue
        if '姓名/名称' in text:
            _name['position'] = bbox
            _name['words'] = text.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in text:
            _id['position'] = bbox
            _id['words'] = text.split(':')[-1]
    return _name, _id
def get_seller(self):
    """Find the dealer / vehicle seller value on page ``'1'``.

    The label span ('经销商' or '车辆销售方') is located first. The value is
    the last span whose centre lies to the right of the label, left of the
    page's horizontal midpoint, and within the label's vertical extent.

    Returns:
        dict: copy of ``self.item`` with ``words`` / ``position`` of the
        value span when found.
    """
    seller = self.item.copy()
    page = self.pdf_info['1']
    spans = [(span['bbox'], span['text'])
             for block in page['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]

    # Locate the label (the last occurrence wins).
    anchor_bbox = None
    for bbox, text in spans:
        if text in ('经销商', '车辆销售方'):
            anchor_bbox = bbox

    if not anchor_bbox:
        return seller

    # With the label known, match the value by geometry.
    half_width = page['width'] * 0.5
    for bbox, text in spans:
        right_of_label = anchor_bbox[2] < np.mean(bbox[::2]) < half_width
        if right_of_label and anchor_bbox[1] < np.mean(bbox[1::2]) < anchor_bbox[3]:
            seller['position'] = bbox
            seller['words'] = text
    return seller
def get_borrower_collection_account(self):
    """Extract the borrower's receiving account from page ``'1'``.

    Only runs when the page text mentions '借款人收款账户'. Each value is
    taken from the first regex match in the space-stripped page text, then
    positioned at the last span whose raw text contains that value.

    Returns:
        tuple: ``(account, account_name, account_bank)``, copies of
        ``self.item``.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info['1']['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]
    all_text = ''.join(text for _bbox, text in spans)

    # Only the inline account format is handled (not the "notified
    # separately" variant).
    if '借款人收款账户' in all_text:
        all_text = all_text.replace(' ', '').replace(' ', '')

        def fill(field, pattern):
            # First regex hit gives the value; the last span containing it
            # gives the position.
            hits = re.findall(pattern, all_text)
            if not hits:
                return
            value = hits[0]
            for bbox, text in spans:
                if f'{value}' in text:
                    field['position'] = bbox
                    field['words'] = value

        fill(account, r'账号:(.*?)户名')
        fill(account_name, r'户名:(.*?)开户行')
        fill(account_bank, r'开户行:(.*?)借款人')
    return account, account_name, account_bank
def get_payback_account(self):
    """Extract the repayment account from page ``'1'``.

    Only runs when the page text contains '(13) 还款账户'; the text after
    the last occurrence of that heading, with spaces removed, is searched.
    Account number and holder are positioned at the last span containing
    the value; the bank is positioned at the last span whose space-stripped
    text contains ``开户行:<value>;``.

    Returns:
        tuple: ``(account, account_name, account_bank)``, copies of
        ``self.item``.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info['1']['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]
    all_text = ''.join(text for _bbox, text in spans)

    # Only the inline account format is handled (not the "notified
    # separately" variant).
    if '(13) 还款账户' in all_text:
        all_text = all_text.split('(13) 还款账户')[-1]
        all_text = all_text.replace(' ', '').replace(' ', '')

        def fill(field, pattern, span_matches):
            hits = re.findall(pattern, all_text)
            if not hits:
                return
            value = hits[0]
            for bbox, text in spans:
                if span_matches(value, text):
                    field['position'] = bbox
                    field['words'] = value

        def contains(value, text):
            return f'{value}' in text

        def bank_clause(value, text):
            return f'开户行:{value};' in text.replace(' ', '')

        fill(account, r'账号:(.*?)户名', contains)
        fill(account_name, r'户名:(.*?)开户行', contains)
        fill(account_bank, r'开户行:(.*?);', bank_clause)
    return account, account_name, account_bank
def get_repayment_schedule(self):
    """Collect the repayment schedule table from page ``'2'``.

    Span texts are gathered from the '序号' header cell up to (excluding) the
    footnote about sequence numbers, then cut into rows of five cells. Row
    collection stops at the first row whose second cell equals its 1-based
    row index.

    Returns:
        dict: copy of ``self.item``; ``words`` is the list of rows when at
        least one row was collected.
    """
    columns = 5  # the table has five columns
    repayment_schedule = self.item.copy()

    cells = []
    inside_table = False
    for block in self.pdf_info['2']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                _bbox, text = span['bbox'], span['text']
                if text == '序号':
                    inside_table = True
                if '以上表格中所列的序号并非还款期数' in text:
                    inside_table = False
                if inside_table:
                    cells.append(text)

    rows = []
    for row_index in range(len(cells) // columns):
        start = row_index * columns
        row = cells[start:start + columns]
        if row[1] == str(row_index + 1):
            break
        rows.append(row)

    if rows:
        repayment_schedule['words'] = rows
    return repayment_schedule
def get_signature_role_1(self):
    """Check whether the borrower (mortgagor) signature area holds any text.

    Spans are collected across all pages from a span containing
    '借款人(抵押人)' until a span containing '日期'. More than four
    collected spans count as signed.

    Returns:
        dict: copy of the field template with ``page_num`` (page key of the
        last collected span, or None), ``position`` (enclosing box
        ``[xmin, ymin, xmax, ymax]`` of the collected spans, or None when
        nothing was collected) and ``words`` ('有' if signed else '无').
    """
    # ``init_item`` is not assigned anywhere in this class; fall back to the
    # ``item`` template instead of raising AttributeError.
    signature_role_1 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key, page in self.pdf_info.items():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '借款人(抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Without this guard, min()/max() raised ValueError when no span was
        # collected.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_1['page_num'] = page_num
    signature_role_1['position'] = position
    signature_role_1['words'] = words
    return signature_role_1
def get_signature_role_2(self):
    """Check whether the co-borrower (co-mortgagor) signature area holds text.

    Spans are collected across all pages from a span containing
    '共同借款人(共同抵押人)' until a span containing '日期'. More than four
    collected spans count as signed.

    Returns:
        dict: copy of the field template with ``page_num`` (page key of the
        last collected span, or None), ``position`` (enclosing box
        ``[xmin, ymin, xmax, ymax]`` of the collected spans, or None when
        nothing was collected) and ``words`` ('有' if signed else '无').
    """
    # ``init_item`` is not assigned anywhere in this class; fall back to the
    # ``item`` template instead of raising AttributeError.
    signature_role_2 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key, page in self.pdf_info.items():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Without this guard, min()/max() raised ValueError when no span was
        # collected.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
def get_signature_role_3(self):
    """Check whether the guarantor-1 signature area holds any text.

    Spans are collected from a span containing '保证人1' on any page whose
    key is not 0 until a span containing '日期'. More than four collected
    spans count as signed.

    Returns:
        dict: copy of the field template with ``page_num`` (page key of the
        last collected span, or None), ``position`` (enclosing box
        ``[xmin, ymin, xmax, ymax]`` of the collected spans, or None when
        nothing was collected) and ``words`` ('有' if signed else '无').
    """
    # ``init_item`` is not assigned anywhere in this class; fall back to the
    # ``item`` template instead of raising AttributeError.
    signature_role_3 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key, page in self.pdf_info.items():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    # The first page is excluded as a start of the region.
                    if '保证人1' in text and int(page_key) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Without this guard, min()/max() raised ValueError when no span was
        # collected.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
def get_signature_role_4(self):
    """Check whether the guarantor-2 signature area holds any text.

    Spans are collected from a span containing '保证人2' on any page whose
    key is not 0 until a span containing '日期'. More than four collected
    spans count as signed.

    Returns:
        dict: copy of the field template with ``page_num`` (page key of the
        last collected span, or None), ``position`` (enclosing box
        ``[xmin, ymin, xmax, ymax]`` of the collected spans, or None when
        nothing was collected) and ``words`` ('有' if signed else '无').
    """
    # ``init_item`` is not assigned anywhere in this class; fall back to the
    # ``item`` template instead of raising AttributeError.
    signature_role_4 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key, page in self.pdf_info.items():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    # The first page is excluded as a start of the region.
                    if '保证人2' in text and int(page_key) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Without this guard, min()/max() raised ValueError when no span was
        # collected.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
def get_signature_role_5(self):
    """Check whether the witness signature area holds any text.

    Spans are collected from a span containing '见证人签字' on any page whose
    key is not 0 until a span containing '年'. More than four collected
    spans count as signed.

    Returns:
        dict: copy of the field template with ``page_num`` (page key of the
        last collected span, or None), ``position`` (enclosing box
        ``[xmin, ymin, xmax, ymax]`` of the collected spans, or None when
        nothing was collected) and ``words`` ('有' if signed else '无').
    """
    # ``init_item`` is not assigned anywhere in this class; fall back to the
    # ``item`` template instead of raising AttributeError.
    signature_role_5 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key, page in self.pdf_info.items():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    # The first page is excluded as a start of the region.
                    if '见证人签字' in text and int(page_key) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)
    # (The leftover debug print of ``texts`` was removed.)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Without this guard, min()/max() raised ValueError when no span was
        # collected.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
def get_last_page_signature(self, page_num, top, bottom):
    """Read a signer name and signing date between two marker spans.

    The band is bounded by the top edge (``bbox[1]``) of the last span
    containing ``top`` and of the last span containing ``bottom``. Inside
    it, the last span containing '签署日期' supplies both values.

    Args:
        page_num (str): key of the page inside ``self.pdf_info``.
        top (str): text marking the upper boundary.
        bottom (str): text marking the lower boundary.

    Returns:
        tuple: ``(signature_name, signature_date)``, copies of ``self.item``.
        The name is the text before the first space, the date the text
        after the last ``:``; both share the span's bbox as position.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()
    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]

    anchor_top = None
    anchor_bottom = None
    for bbox, text in spans:
        if top in text:
            anchor_top = bbox[1]
        if bottom in text:
            anchor_bottom = bbox[1]

    # Both markers are required; otherwise the templates are returned as is.
    if anchor_top is None or anchor_bottom is None:
        return signature_name, signature_date

    for bbox, text in spans:
        if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
            signature_name['words'] = text.split(' ')[0]
            signature_name['position'] = bbox
            signature_date['words'] = text.split(':')[-1]
            signature_date['position'] = bbox
    return signature_name, signature_date
def get_info(self):
"""
Run all field extractors and assemble the per-page recognition result.

NOTE(review): the original docstring said that block['type'] == 0 marks an
image element, while the extractors in this file skip every block whose
type is not 0 before reading its text lines - confirm the intended meaning.

Returns:
dict: {"is_asp": self.is_asp, "page_info": self.init_result}
"""
# First decide whether this is an ASP product.
# Only the first page is checked: if it contains the additional-product
# financing principal total phrase matched below, it is an ASP product.
# (Earlier pdf_info-based version of the same check, kept for reference.)
# print(self.pdf_info['0']['blocks'])
# for block in self.pdf_info['0']['blocks']:
# if block['type'] != 0:
# continue
# for line in block['lines']:
# for span in line['spans']:
# bbox, text = span['bbox'], span['text']
# if '附加产品融资贷款本金总金额' == text:
# self.is_asp = True
for key in self.ocr_results['0']:
bbox, text = self.ocr_results['0'][key]
if '附加产品融资贷款本金总金额' in text:
self.is_asp = True
self.gen_init_result(self.is_asp)
if len(list(self.ocr_results.keys())) <= 8: # samples of contract version 8.5 provided by the customer have content running across pages and cannot be recognized yet
# Page 1
# Contract number
contract_no = self.get_contract_no(page_num='0')
# print(contract_no)
self.init_result['page_1']['合同编号'] = contract_no
# Price of the purchased vehicle
vehicle_price = self.get_vehicle_price()
# print(vehicle_price)
self.init_result['page_1']['所购车辆价格'] = vehicle_price
# VIN
vin = self.get_vin()
# print(vin)
self.init_result['page_1']['车架号'] = vin
# Loan principal; for an ASP product this item also carries the vehicle
# loan principal and the additional-product financing principal total
upper, lower, asp_1, asp_2 = self.get_loan_principal()
# print(upper, lower, asp_1, asp_2)
self.init_result['page_1']['贷款本金金额']['大写'] = upper
self.init_result['page_1']['贷款本金金额']['小写'] = lower
self.init_result['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1
self.init_result['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
# Loan term
loan_term = self.get_loan_term()
# print(loan_term)
self.init_result['page_1']['贷款期限'] = loan_term
# Breakdown of the additional-product financing principal total (ASP table)
asp_details_table = self.get_asp_details(page_num='0')
# print(asp_details_table)
self.init_result['page_1']['附加产品融资贷款本金总金额明细'] = asp_details_table
# Borrower signature and date
signature = self.get_signature()
# print(signature)
self.init_result['page_1']['借款人签字及时间'] = signature
#######################################
# Page 2
# Contract number
# NOTE(review): this reads page '0' although every other page section reads
# its own page - confirm whether page_num='1' was intended.
contract_no = self.get_contract_no(page_num='0')
# print(contract_no)
self.init_result['page_2']['合同编号'] = contract_no
# Borrower and mortgagor (the address field originally contains spaces)
borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人:')
# Fallback to stay compatible with contract version 8.1
if borrower_name['words'] == None:
borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:')
# Fallback for the contract version where vehicle and loan are separated
if borrower_name['words'] == None:
borrower_name, borrower_id = self.get_somebody(top='借款人:', bottom='共同借款人及抵押人:')
# print(borrower_name, borrower_id)
self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name
self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id
# Co-borrower and co-mortgagor
co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人:', bottom='保证人1:')
# print(co_borrower_name, co_borrower_id)
self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name
self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id
# Guarantor 1
first_guarantor_name, first_guarantor_id = self.get_somebody(top='保证人1:', bottom='保证人2:')
self.init_result['page_2']['保证人1']['name'] = first_guarantor_name
self.init_result['page_2']['保证人1']['id'] = first_guarantor_id
# Guarantor 2
second_guarantor_name, second_guarantor_id = self.get_somebody(top='保证人2:', bottom='第一章')
self.init_result['page_2']['保证人2']['name'] = second_guarantor_name
self.init_result['page_2']['保证人2']['id'] = second_guarantor_id
# Price of the purchased vehicle
vehicle_price = self.get_vehicle_price(page_num='1')
# print(vehicle_price)
self.init_result['page_2']['所购车辆价格'] = vehicle_price
# VIN
vin = self.get_vin(page_num='1')
# print(vin)
self.init_result['page_2']['车架号'] = vin
# Dealer
seller = self.get_seller()
# print(seller)
self.init_result['page_2']['经销商'] = seller
# Loan principal; for an ASP product this item also carries the vehicle
# loan principal and the additional-product financing principal total
upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1')
# print(upper, lower, asp_1, asp_2)
self.init_result['page_2']['贷款本金金额']['大写'] = upper
self.init_result['page_2']['贷款本金金额']['小写'] = lower
self.init_result['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1
self.init_result['page_2']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
# Loan term
loan_term = self.get_loan_term(page_num='1')
# print(loan_term)
self.init_result['page_2']['贷款期限'] = loan_term
# Standard interest rate of this contract for the current period
standard_rate = self.get_standard_rate(page_num='1')
# print(standard_rate)
self.init_result['page_2']['标准利率'] = standard_rate
# Added in the 202212 release: the borrower's receiving account
account, account_name, account_bank = self.get_borrower_collection_account()
# print(account, account_name, account_bank)
self.init_result['page_2']['借款人收款账户']['账号'] = account
self.init_result['page_2']['借款人收款账户']['户名'] = account_name
self.init_result['page_2']['借款人收款账户']['开户行'] = account_bank
# Repayment account
account, account_name, account_bank = self.get_payback_account()
# print(account, account_name, account_bank)
self.init_result['page_2']['还款账户']['账号'] = account
self.init_result['page_2']['还款账户']['户名'] = account_name
self.init_result['page_2']['还款账户']['开户行'] = account_bank
#######################################
# Page 3
# Contract number
contract_no = self.get_contract_no(page_num='2')
self.init_result['page_3']['合同编号'] = contract_no
# Repayment schedule (table)
repayment_schedule_table = self.get_repayment_schedule()
# print(repayment_schedule_table)
self.init_result['page_3']['还款计划表'] = repayment_schedule_table
#######################################
# Page 4
# Contract number
contract_no = self.get_contract_no(page_num='3')
# print(contract_no)
self.init_result['page_4']['合同编号'] = contract_no
# Breakdown of the additional-product financing principal total (ASP table)
asp_details_table = self.get_asp_details(page_num='3')
# print(asp_details_table)
self.init_result['page_4']['附加产品融资贷款本金总金额明细'] = asp_details_table
#######################################
# Page 5
# Contract number
contract_no = self.get_contract_no(page_num='4')
# print(contract_no)
self.init_result['page_5']['合同编号'] = contract_no
#######################################
# Page 6
# Contract number
contract_no = self.get_contract_no(page_num='5')
# print(contract_no)
self.init_result['page_6']['合同编号'] = contract_no
# Page 7
# Contract number
contract_no = self.get_contract_no(page_num='6')
self.init_result['page_7']['合同编号'] = contract_no
# Page 8
# Contract number
contract_no = self.get_contract_no(page_num='7')
self.init_result['page_8']['合同编号'] = contract_no
# Main borrower signature; retried with the alternative lower anchor text
# when the first attempt finds no name.
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='合同编号', bottom='共同借款人')
if signature_name['words'] == None:
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='合同编号', bottom='共同借款人(抵押人)')
# print(signature_name, signature_date)
self.init_result['page_8']['主借人签字']['签字'] = signature_name
self.init_result['page_8']['主借人签字']['日期'] = signature_date
# Co-borrower signature; retried with the alternative upper anchor text.
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='共同借款人', bottom='保证人1')
if signature_name['words'] == None:
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='共同借款人(抵押人)', bottom='保证人1')
# print(signature_name, signature_date)
self.init_result['page_8']['共借人签字']['签字'] = signature_name
self.init_result['page_8']['共借人签字']['日期'] = signature_date
# Guarantor 1 signature
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='保证人1', bottom='保证人2')
self.init_result['page_8']['保证人1签字']['签字'] = signature_name
self.init_result['page_8']['保证人1签字']['日期'] = signature_date
# Guarantor 2 signature
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='保证人2', bottom='在本人面前亲笔签署本合同')
self.init_result['page_8']['保证人2签字']['签字'] = signature_name
self.init_result['page_8']['保证人2签字']['日期'] = signature_date
# Witness signature
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='在本人面前亲笔签署本合同', bottom='以下无正文')
# print(signature_name, signature_date)
self.init_result['page_8']['见证人签字']['签字'] = signature_name
self.init_result['page_8']['见证人签字']['日期'] = signature_date
# Repackage the output
new_results = {"is_asp": self.is_asp,
"page_info": self.init_result
}
return new_results
\ No newline at end of file
import re
import numpy as np
from fuzzywuzzy import fuzz
from shapely.geometry import Polygon
def caculate_iou(g, p):
    """Return the intersection-over-union of two polygons.

    Args:
        g: polygon vertices, either a flat sequence (x0, y0, x1, y1, ...) or
            an array that reshapes to (N, 2).
        p: second polygon, same layout as ``g``.

    Returns:
        float: intersection area divided by union area; 0.0 when the union
        has zero area (both polygons degenerate), which previously raised
        ZeroDivisionError.
    """
    g = Polygon(np.array(g).reshape((-1, 2)))
    p = Polygon(np.array(p).reshape((-1, 2)))
    inter = g.intersection(p).area
    union = g.area + p.area - inter
    if union == 0:
        return 0.0
    return inter / union
def get_table_info(bbox_1, bbox_2, ocr_result):
    """Return the text of the table cell at a row/column crossing.

    A probe quadrilateral is built from the x coordinates of ``bbox_2`` and
    the y coordinates of ``bbox_1``; the text of the last OCR entry that
    overlaps the probe is returned.

    Args:
        bbox_1: 8-value quadrilateral supplying the y coordinates (the row).
        bbox_2: 8-value quadrilateral supplying the x coordinates (the column).
        ocr_result: iterable of ``[quadrilateral, text]`` entries.

    Returns:
        The matched text, or '' when no entry overlaps the probe.
    """
    # Even positions are x values (taken from bbox_2), odd ones y values (bbox_1).
    probe = [bbox_1[k] if k % 2 else bbox_2[k] for k in range(8)]
    matched = ''
    for entry in ocr_result:
        if caculate_iou(probe, entry[0]) > 0:
            matched = entry[1]
    return matched
class Finder:
def __init__(self, pdf_info):
self.pdf_info = pdf_info
self.item = {"words": None,
"page": None,
"position": None,
}
# 格式化算法输出
self.init_result = {"合同编号": self.item,
"承租人-姓名": self.item,
"承租人-证件号码": self.item,
"承租人-法定代表人或授权代表": self.item,
"共同承租人-姓名": self.item,
"共同承租人-证件号码": self.item,
"共同承租人-法定代表人或授权代表": self.item,
"保证人1-姓名": self.item,
"保证人1-证件号码": self.item,
"保证人1-法定代表人或授权代表": self.item,
"保证人2-姓名": self.item,
"保证人2-证件号码": self.item,
"保证人2-法定代表人或授权代表": self.item,
"保证人3-姓名": self.item,
"保证人3-证件号码": self.item,
"保证人3-法定代表人或授权代表": self.item,
"合同编号(正文)": self.item,
"车辆识别代码": self.item,
"车辆卖方(经销商)": self.item,
"车辆原始销售价格(《机动车销售统一发票》所列金额)": self.item,
"车辆附加产品明细表": self.item,
"融资成本总额": self.item,
"租期": self.item,
"付款计划表": self.item,
"承租人收款账户-户名": self.item,
"承租人收款账户-银行账号": self.item,
"承租人收款账户-开户行": self.item,
"承租人扣款账户-户名": self.item,
"承租人扣款账户-银行账号": self.item,
"承租人扣款账户-开户行": self.item,
"签字页-承租人姓名": self.item,
"签字页-承租人签章": self.item,
"签字页-共同承租人姓名": self.item,
"签字页-共同承租人签章": self.item,
"签字页-保证人1姓名": self.item,
"签字页-保证人1签章": self.item,
"签字页-保证人2姓名": self.item,
"签字页-保证人2签章": self.item,
"签字页-保证人3姓名": self.item,
"签字页-保证人3签章": self.item,
}
# 格式化输出 车辆处置协议 要是别的字段
self.init_result_1 = {"合同编号": self.item,
"承租人-姓名": self.item,
"承租人-证件号码": self.item,
"销售经销商": self.item,
"合同编号(正文)": self.item,
"签字页-承租人姓名": self.item,
"签字页-承租人证件号码": self.item,
"签字页-承租人签章": self.item,
"签字页-销售经销商": self.item,
"签字页-销售经销商签章": self.item,
}
# 格式化输出 车辆租赁抵押合同
self.init_result_2 = {"合同编号": self.item,
"合同编号(正文)": self.item,
"抵押人姓名/名称": self.item,
"抵押人证件号码": self.item,
"抵押人配偶姓名/名称": self.item,
"抵押人配偶证件号码": self.item,
"车辆识别代码": self.item,
"租金总额": self.item,
"融资租赁期限": self.item,
"签字页-抵押人姓名": self.item,
"签字页-抵押人签章": self.item,
"签字页-抵押人配偶姓名": self.item,
"签字页-抵押人配偶签章": self.item,
}
def get_contract_no(self, page_num):
    """Read the contract number printed on the given page.

    The last span containing '合同编号:' supplies the value after its last
    ':'. When that value is an empty string, the page is scanned again for
    spans containing 'CH' whose top edge lies above the bottom edge of the
    currently recorded position; each hit replaces the recorded position.

    Args:
        page_num (str): key of the page in ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item`` with ``words``, ``page`` and ``position``
        filled in when a contract number was found.
    """
    result = self.item.copy()
    page_spans = [
        span
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    for span in page_spans:
        box, content = span['bbox'], span['text']
        if '合同编号:' not in content:
            continue
        result['position'] = box
        result['page'] = page_num
        result['words'] = content.split(':')[-1]
    if result['words'] != '':
        return result
    # The label span carried no value: look for the number in a separate span.
    for span in page_spans:
        box, content = span['bbox'], span['text']
        if box[1] < result['position'][3] and 'CH' in content:
            result['position'] = box
            result['page'] = page_num
            result['words'] = content
    return result
def get_vehicle_price(self, page_num='0'):
    """Read the purchased-vehicle price stated on a page.

    Args:
        page_num: key of the page in ``self.pdf_info`` (defaults to '0').

    Returns:
        dict: copy of ``self.item``; for the last span containing
        '所购车辆价格为人民币', ``words`` is the text after the last '币' and
        ``position`` is the span bbox. ``page`` is left untouched.
    """
    price = self.item.copy()
    marker = '所购车辆价格为人民币'
    page_spans = [
        span
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    for span in page_spans:
        box, content = span['bbox'], span['text']
        if marker in content:
            price['position'] = box
            price['words'] = content.split('币')[-1]
    return price
def get_contract_no_one(self):
    """Find the contract number quoted in the body text of the document.

    The text of each page is concatenated (spaces removed) so that a number
    broken across several spans can still be matched. Pages are scanned in
    order and the first page matching one of the patterns wins.

    The last pattern used to be written with an unescaped ')' and made
    ``re.search`` raise ``re.error`` on every page that matched neither of the
    first two patterns; the parenthesis is now escaped.

    Returns:
        dict: copy of ``self.item`` with ``page`` and ``words`` filled in
        (``position`` stays None); the untouched copy when no page matches.
    """
    contract_no = self.item.copy()
    # (pattern, whether ')' is stripped from the captured value)
    patterns = (
        (r'(合同编号:\[(.*?)\])', True),
        (r'编号为(.*?)的', True),
        (r'编号为(.*?)\)的', False),
    )
    for pno in self.pdf_info:
        all_text = ''.join(
            span['text']
            for block in self.pdf_info[pno]['blocks']
            if block['type'] == 0
            for line in block['lines']
            for span in line['spans']
        )
        all_text = all_text.replace(' ', '')
        for pattern, strip_paren in patterns:
            match_obj = re.search(pattern, all_text)
            if not match_obj:
                continue
            # Note: for the first pattern group(1) is the whole labelled
            # expression, label included.
            words = re.sub(r"\s", "", match_obj.group(1))
            if strip_paren:
                words = words.replace(")", "")
            contract_no['position'] = None
            contract_no['page'] = pno
            contract_no['words'] = words
            return contract_no
    return contract_no
def get_key_value(self, key, page_num=None):
    """Return the value printed after ``key`` in a "key:value" span.

    Args:
        key: text that identifies the span.
        page_num: key of the single page to search; when None every page of
            ``self.pdf_info`` is searched.

    Returns:
        dict: copy of ``self.item``. For the last span containing ``key``,
        ``words`` is the text after the last ':' with '。' and all whitespace
        removed, ``page`` the page key and ``position`` the span bbox.
    """
    value = self.item.copy()
    # One scan loop serves both the single-page and the all-pages case.
    pages = self.pdf_info if page_num is None else (page_num,)
    for pno in pages:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if key in text:
                        words = text.split(':')[-1].replace("。", "")
                        value['position'] = bbox
                        value['page'] = pno
                        value['words'] = re.sub(r"\s", "", words)
    return value
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from a page.

    Args:
        page_num: key of the page in ``self.pdf_info`` (defaults to '0').

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of ``self.item``.
            upper - amount in capital Chinese numerals: last span whose fuzzy
                    ratio against the numeral characters exceeds 15 (value
                    after the last ':').
            lower - numeric amount: last span containing '小写:¥'.
            asp_1 / asp_2 - bracketed amounts of spans containing
                    '人民币:小写:' located above / below the span whose text
                    equals '附加产品融资贷款本金总金额'; only searched when that
                    span exists.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    # Built once instead of once per span.
    keyword_text = ''.join(chinese_keywords)
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()
    page_spans = [
        span
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    anchor_bbox = None
    for span in page_spans:
        bbox, text = span['bbox'], span['text']
        if fuzz.ratio(keyword_text, text) > 15:
            # The shortened text is also what the two checks below see.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox
    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for span in page_spans:
            bbox, text = span['bbox'], span['text']
            if '人民币:小写:' not in text:
                continue
            # A label without the bracketed amount used to raise IndexError
            # through ``re.findall(...)[0]``; such spans are skipped now.
            found = re.search(r'人民币:小写:\[(.*)\]', text)
            if found is None:
                continue
            center_y = np.mean(bbox[1::2])
            if center_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = found.group(1)
            if center_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = found.group(1)
    return upper, lower, asp_1, asp_2
def get_loan_term(self, page_num='0'):
    """Read the loan term (number of months) from a page.

    All span texts of the page are concatenated and searched for
    '贷款期限<digits>个月'; the last span containing '<digits>个月' then
    provides the position.

    Args:
        page_num: key of the page in ``self.pdf_info`` (defaults to '0').

    Returns:
        dict: copy of ``self.item`` with ``words`` (the digits) and
        ``position`` set when such a span exists.
    """
    term = self.item.copy()
    page_spans = [
        span
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    joined = ''.join(span['text'] for span in page_spans)
    found = re.search(r'贷款期限(\d+)个月', joined)
    if not found:
        return term
    months = found.group(1)
    needle = f'{months}个月'
    for span in page_spans:
        if needle in span['text']:
            term['position'] = span['bbox']
            term['words'] = months
    return term
def get_asp_details(self, page_num):
    """Collect the additional-product breakdown table of a page.

    Span texts are gathered from the span whose text equals
    '附加产品融资贷款本金总金额明细' up to (not including) the next span that
    contains '第二条' or '征信管理'. The first gathered text forms a row of its
    own (the title); the following texts are grouped three per row, and an
    incomplete trailing group is dropped.

    Args:
        page_num: key of the page in ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item`` whose ``words`` is the list of rows when
        at least one row was built.
    """
    result = self.item.copy()
    collected = []
    inside = False
    for block in self.pdf_info[page_num]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                content = span['text']
                if '附加产品融资贷款本金总金额明细' == content:
                    inside = True
                if '第二条' in content or '征信管理' in content:
                    inside = False
                if inside:
                    collected.append(content)
    row_count = (len(collected) + 2) // 3
    rows = [[collected[0]]] if row_count else []
    # Row r (r >= 1) covers the elements 3r-2 .. 3r.
    rows.extend(collected[3 * r - 2:3 * r + 1] for r in range(1, row_count))
    if rows:
        result['words'] = rows
    return result
def get_signature(self):
    """Return the signing-date span of page '0'.

    Returns:
        dict: copy of ``self.item``; ``words`` and ``position`` come from the
        last span on page '0' whose text contains '签署日期'.
    """
    signature = self.item.copy()
    first_page_spans = [
        span
        for block in self.pdf_info['0']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    for span in first_page_spans:
        box, content = span['bbox'], span['text']
        if '签署日期' in content:
            signature['words'] = content
            signature['position'] = box
    return signature
def get_somebody(self, top, bottom):
    """Return name and ID number of the party listed between two labels.

    Works on page '1'. The bottom edges (``bbox[3]``) of the last spans
    containing ``top`` and ``bottom`` delimit a vertical range; both default
    to 0 when the label is missing. Spans whose bottom edge lies strictly
    inside that range are inspected for the name and ID labels.

    Args:
        top: label text marking the upper boundary.
        bottom: label text marking the lower boundary.

    Returns:
        tuple: ``(name, id)``, copies of ``self.item`` whose ``words`` is the
        text after the last ':' of the matching span and whose ``position``
        is its bbox.
    """
    person_name = self.item.copy()
    person_id = self.item.copy()
    page_spans = [
        span
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    # Pass 1: vertical boundaries.
    upper_edge = 0
    lower_edge = 0
    for span in page_spans:
        box, content = span['bbox'], span['text']
        if top in content:
            upper_edge = box[3]
        if bottom in content:
            lower_edge = box[3]
    # Pass 2: fields located between the boundaries.
    for span in page_spans:
        box, content = span['bbox'], span['text']
        if not upper_edge < box[3] < lower_edge:
            continue
        if '姓名/名称' in content:
            person_name['position'] = box
            person_name['words'] = content.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in content:
            person_id['position'] = box
            person_id['words'] = content.split(':')[-1]
    return person_name, person_id
def get_seller(self):
    """Return the dealer name printed next to the '经销商' label on page '1'.

    The value is the last span whose horizontal centre lies between the right
    edge of the label and the middle of the page, and whose vertical centre
    lies between the top and bottom edge of the label.

    Returns:
        dict: copy of ``self.item`` with ``words`` and ``position`` set when
        both the label and a matching value span exist.
    """
    seller = self.item.copy()
    page = self.pdf_info['1']
    page_spans = [
        span
        for block in page['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    # Locate the label first.
    label_box = None
    for span in page_spans:
        box, content = span['bbox'], span['text']
        if '经销商' == content:
            label_box = box
    if not label_box:
        return seller
    # Then match the value against the label geometry.
    middle_x = page['width'] * 0.5
    for span in page_spans:
        box, content = span['bbox'], span['text']
        right_of_label = label_box[2] < np.mean(box[::2]) < middle_x
        if right_of_label and label_box[1] < np.mean(box[1::2]) < label_box[3]:
            seller['position'] = box
            seller['words'] = content
    return seller
def get_payback_account(self):
    """Extract the repayment account number, holder and bank from page '1'.

    Only the ticked "account" variant is handled: nothing is extracted unless
    the concatenated page text contains '☑账号'. The values are taken from the
    concatenated, space-free page text; each value is then looked up in the
    individual spans to obtain its position.

    Returns:
        tuple: ``(account, account_name, account_bank)``, copies of
        ``self.item`` with ``words`` and ``position`` set when the value was
        located in a span.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()
    page_spans = [
        span
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    merged = ''.join(span['text'] for span in page_spans)
    # Decide which account variant is filled in; only the explicit one is read.
    if '☑账号' not in merged:
        return account, account_name, account_bank
    merged = merged.replace(' ', '')
    # (target item, pattern over the merged text, value is the bank entry)
    specs = (
        (account, r'账号:(.*)户名', False),
        (account_name, r'户名:(.*)开户行', False),
        (account_bank, r'开户行:(.*);', True),
    )
    for target, pattern, is_bank in specs:
        found = re.findall(pattern, merged)
        if not found:
            continue
        value = found[0]
        for span in page_spans:
            box, content = span['bbox'], span['text']
            if is_bank:
                hit = f'开户行:{value};' in content.replace(' ', '')
            else:
                hit = f'{value}' in content
            if hit:
                target['position'] = box
                target['words'] = value
    return account, account_name, account_bank
def get_repayment_schedule(self):
# Build the payment schedule table as rows of [index, rent].
# Cell texts are collected across all pages once a span whose stripped text
# is "61." has been seen, and collection stops at a span containing
# '以上表格中所列序号'. Returns a copy of self.item whose 'words' is the table
# (header row ['序号', '租金'] first) and whose 'page' is the page key on
# which "61." was found.
repayment_schedule = self.item.copy()
repayment_schedule_text_list = []
table = False
page = None
# Horizontal limits for table cells: 'left' is the left edge of the "61."
# span, 'right' the right edge of the last span containing '剩余融资'.
left = 0
right = 0
for pno in self.pdf_info:
for block in self.pdf_info[pno]['blocks']:
if block['type'] != 0:
continue
for line in block['lines']:
for span in line['spans']:
bbox, text = span['bbox'], span['text']
if '剩余融资' in text:
right = bbox[2]
if '以上表格中所列序号' in text:
table = False
if table == True:
# Skip spans that contain Chinese characters
if re.compile(r'[\u4e00-\u9fff]').search(text):
continue
# Skip the numbered headings (1. - 61.): spans holding exactly one run of digits
if re.findall("\d+", text):
if len(re.findall("\d+", text)) == 1:
continue
if not left < bbox[0] < right:
continue
repayment_schedule_text_list.append(text)
if text.strip() == "61.":
page = pno
table = True
left = bbox[0]
# print("repayment_schedule_text_list = ", repayment_schedule_text_list)
# repayment_schedule_table = [['序号', '融资租赁成本', '融资租赁费用', '租金', '剩余融资租赁成本']]
repayment_schedule_table = [['序号', '租金']]
for i in range(len(repayment_schedule_text_list)//4):
line = [f'{i+1}.']
# 4 is the number of table columns
for j in range(4):
line.append(repayment_schedule_text_list[i*4+j])
# Keep only the index and the rent column (third collected cell of the row)
line = [line[0].replace('.', ''), line[3]]
repayment_schedule_table.append(line)
repayment_schedule['words'] = repayment_schedule_table
repayment_schedule['page'] = page
return repayment_schedule
def get_signature_role_1(self):
    """Return the last signing-date span found in the whole document.

    Returns:
        dict: copy of ``self.item``; ``words``, ``page`` and ``position``
        describe the last span, over all pages, containing '签署日期'.
    """
    signature_role_1 = self.item.copy()
    located = [
        (pno, span)
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    for pno, span in located:
        box, content = span['bbox'], span['text']
        if '签署日期' in content:
            signature_role_1['position'] = box
            signature_role_1['page'] = pno
            signature_role_1['words'] = content
    return signature_role_1
def get_signature_role_2(self):
    """Report whether the co-borrower signature area appears to be signed.

    Spans are collected starting at a span containing '共同借款人(共同抵押人)'
    and collection stops at the next span containing '日期'. More than four
    collected spans is reported as signed.

    Returns:
        dict: copy of the item template with
            ``page_num``  - key of the last page that contributed a span, or None;
            ``position``  - [x_min, y_min, x_max, y_max] over all collected
                            boxes, or None when nothing was collected;
            ``words``     - '有' when more than four spans were collected,
                            otherwise '无'.
    """
    # ``__init__`` of this class does not assign ``init_item``; fall back to
    # ``item`` so the method does not fail with AttributeError.
    signature_role_2 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Each bbox contributes two (x, y) corner points; an empty list must be
        # skipped because min()/max() raise ValueError on an empty sequence.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
def get_signature_role_3(self):
    """Report whether the guarantor-1 signature area appears to be signed.

    Spans are collected starting at a span containing '保证人1' on any page
    whose key is not 0, and collection stops at the next span containing
    '日期'. More than four collected spans is reported as signed.

    Returns:
        dict: copy of the item template with
            ``page_num``  - key of the last page that contributed a span, or None;
            ``position``  - [x_min, y_min, x_max, y_max] over all collected
                            boxes, or None when nothing was collected;
            ``words``     - '有' when more than four spans were collected,
                            otherwise '无'.
    """
    # ``__init__`` of this class does not assign ``init_item``; fall back to
    # ``item`` so the method does not fail with AttributeError.
    signature_role_3 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(i) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Each bbox contributes two (x, y) corner points; an empty list must be
        # skipped because min()/max() raise ValueError on an empty sequence.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
def get_signature_role_4(self):
    """Report whether the guarantor-2 signature area appears to be signed.

    Spans are collected starting at a span containing '保证人2' on any page
    whose key is not 0, and collection stops at the next span containing
    '日期'. More than four collected spans is reported as signed.

    Returns:
        dict: copy of the item template with
            ``page_num``  - key of the last page that contributed a span, or None;
            ``position``  - [x_min, y_min, x_max, y_max] over all collected
                            boxes, or None when nothing was collected;
            ``words``     - '有' when more than four spans were collected,
                            otherwise '无'.
    """
    # ``__init__`` of this class does not assign ``init_item``; fall back to
    # ``item`` so the method does not fail with AttributeError.
    signature_role_4 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(i) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Each bbox contributes two (x, y) corner points; an empty list must be
        # skipped because min()/max() raise ValueError on an empty sequence.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
def get_signature_role_5(self):
    """Report whether the witness signature area appears to be signed.

    Spans are collected starting at a span containing '见证人签字' on any page
    whose key is not 0, and collection stops at the next span containing '年'.
    More than four collected spans is reported as signed.

    Returns:
        dict: copy of the item template with
            ``page_num``  - key of the last page that contributed a span, or None;
            ``position``  - [x_min, y_min, x_max, y_max] over all collected
                            boxes, or None when nothing was collected;
            ``words``     - '有' when more than four spans were collected,
                            otherwise '无'.
    """
    # ``__init__`` of this class does not assign ``init_item``; fall back to
    # ``item`` so the method does not fail with AttributeError.
    signature_role_5 = getattr(self, 'init_item', self.item).copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(i) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Each bbox contributes two (x, y) corner points; an empty list must be
        # skipped because min()/max() raise ValueError on an empty sequence.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
def get_last_page_signature(self, page_num, top, bottom):
    """Extract signer name and signing date found between two anchor texts.

    The vertical band is bounded by the top edge (``bbox[1]``) of the last span
    containing ``top`` and of the last span containing ``bottom`` on the page.
    Inside that band, the last span containing '签署日期' is split into a name
    (text before the first space) and a date (text after the last ':').

    Args:
        page_num: key of the page in ``self.pdf_info``.
        top: text marking the upper bound of the band.
        bottom: text marking the lower bound of the band.

    Returns:
        tuple: ``(signature_name, signature_date)``, both copies of
        ``self.item`` with ``words`` and ``position`` set when a signature
        span was found.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()
    anchor_top = None
    anchor_bottom = None
    for block in self.pdf_info[page_num]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if top in text:
                    anchor_top = bbox[1]
                if bottom in text:
                    anchor_bottom = bbox[1]
    if anchor_top is not None and anchor_bottom is not None:
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
                        signature_name['words'] = text.split(' ')[0]
                        signature_name['position'] = bbox
                        signature_date['words'] = text.split(':')[-1]
                        # Was assigned to signature_name a second time, which
                        # left the date item without a position.
                        signature_date['position'] = bbox
    return signature_name, signature_date
def get_electronic_signature(self, top, bottom):
    """Return the signing-date span located between two anchor texts.

    All pages are scanned. The band runs from the top edge (``bbox[1]``) of the
    last span containing ``top`` to the bottom edge (``bbox[3]``) of the last
    span containing ``bottom``; page boundaries are not taken into account.

    Args:
        top: text marking the upper bound of the band.
        bottom: text marking the lower bound of the band.

    Returns:
        dict: copy of ``self.item``; ``words``, ``page`` and ``position``
        describe the last span containing '签署日期' whose vertical centre lies
        strictly inside the band.
    """
    signature = self.item.copy()
    located = [
        (pno, span)
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    band_top = None
    band_bottom = None
    for _, span in located:
        box, content = span['bbox'], span['text']
        if top in content:
            band_top = box[1]
        if bottom in content:
            band_bottom = box[3]
    if band_top is None or band_bottom is None:
        return signature
    for pno, span in located:
        box, content = span['bbox'], span['text']
        if '签署日期' not in content:
            continue
        if int(band_top) < np.mean(box[1::2]) < int(band_bottom):
            signature['words'] = content
            signature['page'] = pno
            signature['position'] = box
    return signature
def get_role_info(self, role_key, page_num='0'):
    """Extract name, ID number and representative of one contract party.

    The top-left corner of the last span starting with '保证人3' splits the
    page into four quadrants. ID number and representative are only looked
    up for the four roles tied to a quadrant: '承租人:' (left/above),
    '保证人1:' (left/below), '保证人2:' (right/above) and '保证人3:'
    (right/below). For any other ``role_key`` only the name is extracted.
    Nothing is extracted when the reference span is missing.

    Args:
        role_key: label prefix of the role; it is passed to ``re.match`` as a
            pattern against each span text.
        page_num: key of the page in ``self.pdf_info`` (defaults to '0').

    Returns:
        tuple: ``(name, id_num, representative)``, copies of ``self.item``
        whose ``words`` is the text after the last ':' of the matching span.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()
    page_spans = [
        span
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    # Reference point: top-left corner of the guarantor-3 label.
    anchor = None
    for span in page_spans:
        box, content = span['bbox'], span['text']
        if re.match('保证人3', content) is not None:
            anchor = [box[0], box[1]]
    if anchor is None:
        return name, id_num, representative

    # role -> (lies right of the anchor, lies below the anchor)
    quadrants = {
        '承租人:': (False, False),
        '保证人1:': (False, True),
        '保证人2:': (True, False),
        '保证人3:': (True, True),
    }
    quadrant = quadrants.get(role_key)

    def in_quadrant(box):
        """Tell whether the centre of ``box`` lies in the role's quadrant."""
        to_right, below = quadrant
        center_x = np.mean(box[::2])
        center_y = np.mean(box[1::2])
        horizontal = center_x > anchor[0] if to_right else center_x < anchor[0]
        vertical = center_y > anchor[1] if below else center_y < anchor[1]
        return horizontal and vertical

    def fill(target, box, content):
        """Store the value after the last ':' together with its location."""
        target['words'] = content.split(':')[-1]
        target['page'] = page_num
        target['position'] = box

    for span in page_spans:
        box, content = span['bbox'], span['text']
        # Role name
        if re.match(role_key, content) is not None:
            fill(name, box, content)
        if quadrant is None:
            continue
        # ID number inside the quadrant of the role
        if re.match('证件号码:', content) is not None and in_quadrant(box):
            fill(id_num, box, content)
        # Legal representative inside the quadrant of the role
        if re.match('法定代表人或授权代表:', content) is not None and in_quadrant(box):
            fill(representative, box, content)
    return name, id_num, representative
def get_table_add_product(self):
# Rebuild the additional-product table as rows of
# [item, purchase price, actual financed amount].
# Returns a copy of self.item whose 'words' is the filtered list of rows,
# 'page' the page key holding the table and 'position' None.
table_add_product = self.item.copy()
# Page holding the table: the last page with a span containing the marker.
# NOTE(review): when no page contains the marker this stays None and the
# lookup further down uses the key 'None' - confirm callers only invoke
# this for documents that contain the table.
add_product_page_num = None
for pno in self.pdf_info:
for block in self.pdf_info[f'{pno}']['blocks']:
if block['type'] != 0:
continue
for line in block['lines']:
for span in line['spans']:
bbox, text = span['bbox'], span['text']
if '车辆附加产品(明细见下表)' in text:
add_product_page_num = pno
# Convert every span of that page to [8-value quadrilateral, text].
ocr_results = []
for block in self.pdf_info[f'{add_product_page_num}']['blocks']:
if block['type'] != 0:
continue
for line in block['lines']:
for span in line['spans']:
bbox, text = span['bbox'], span['text']
xmin, ymin, xmax, ymax = bbox
bbox = [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]
ocr_results.append([bbox, text])
lines = [['项目', '购买价格', '实际融资金额']]
# Indexes of the header cells and of the total row inside ocr_results.
# NOTE(review): they remain None when the corresponding text is absent and
# are used as list indexes below without a check.
key_xm = None
key_gmjg = None
key_sjrzje = None
key_total = None
for index, span in enumerate(ocr_results):
if span[1] == '项目':
key_xm = index
if span[1] == '购买价格':
key_gmjg = index
if span[1] == '实际融资金额':
key_sjrzje = index
if span[1] == '总计':
key_total = index
bbox, text = ocr_results[key_xm]
# rh: height of the item header cell, used as the step between rows.
rh = abs(bbox[1]-bbox[-1])
# Probe box: the header cell shifted right by 2*rh and down by rh.
anchor = np.array(bbox).reshape((-1, 2))
anchor[:, 0] += 2*rh
anchor[:, 1] += rh
# Up to five passes; a span overlapping the probe becomes a table row and
# the probe is then moved on from that span.
for i in range(5):
for span in ocr_results:
iou = caculate_iou(anchor, span[0])
if iou > 0.01 and span[1].strip() != '所购':
x = get_table_info(span[0], ocr_results[key_gmjg][0], ocr_results)
y = get_table_info(span[0], ocr_results[key_sjrzje][0], ocr_results)
line = [span[1].replace('\u3000', ' '), x, y]
# print(line)
lines.append(line)
anchor = np.array(span[0]).reshape((-1, 2))
anchor[:, 1] += rh
total = get_table_info(ocr_results[key_total][0], ocr_results[key_sjrzje][0], ocr_results)
lines.append(['总计', '', total])
# Item names may be split over two spans in the PDF; examples of such
# pairs (first span, then its continuation):
# 所购 BMW悦然焕
# 新服务
# 所购 BMW5年10
# 万公里长悦保养套餐
# 所购 事故维修补偿
# 方案
# 所购 BMW5年10万公里
# 长悦保养套餐
# 所购 MINI4年6万公里长悦
# 保养套餐
# Keep only rows whose first two characters are one of the known prefixes
# and replace truncated item names with the full name.
filtered_lines = []
for line in lines:
if line[0][:2] not in ['所购', '项目', '总计']:
continue
if 'BMW悦然' in line[0]:
line[0] = '所购 BMW悦然焕新服务'
if 'BMW5年10' in line[0]:
line[0] = '所购 BMW5年10万公里长悦保养套餐'
if '事故维修补' in line[0]:
line[0] = '所购 事故维修补偿方案'
if 'MINI4年6万公里长悦' in line[0]:
line[0] = '所购 MINI4年6万公里长悦保养套餐'
filtered_lines.append(line)
table_add_product['words'] = filtered_lines
table_add_product['page'] = add_product_page_num
table_add_product['position'] = None
return table_add_product
def get_contract_no_dy(self):
    """Find the mortgage contract number.

    The last span (over all pages) containing '抵押合同编号' acts as the label.
    The number is the last span containing 'CH-' whose vertical centre lies
    strictly between the top and bottom edge of that label; page boundaries
    are not taken into account.

    Returns:
        dict: copy of ``self.item`` with ``words``, ``page`` and ``position``
        set when such a span exists.
    """
    contract_no = self.item.copy()
    located = [
        (pno, span)
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    label_box = None
    for _, span in located:
        box, content = span['bbox'], span['text']
        if '抵押合同编号' in content:
            label_box = box
    if label_box is None:
        return contract_no
    for pno, span in located:
        box, content = span['bbox'], span['text']
        on_label_row = label_box[1] < np.mean(box[1::2]) < label_box[3]
        if on_label_row and 'CH-' in content:
            contract_no['position'] = box
            contract_no['page'] = pno
            contract_no['words'] = content
    return contract_no
def get_dyr_name_id(self):
    """Extract the mortgagor's name and ID number below the '抵押人' label.

    The label is the last text span (over all pages) whose text is exactly
    '抵押人'.  Candidate spans are taken from the label's own page only and
    must have their vertical centre strictly between the label's top edge
    and three label-heights below its bottom edge.  A candidate containing
    '姓名' fills the name item, one containing '证件号码' fills the ID item;
    in both cases the stored words are the text after the last ':'.
    When several candidates qualify, the last one wins.

    Returns:
        tuple: ``(name, _id)``, two copies of ``self.item``, filled in
        where a matching span was found.
    """
    name = self.item.copy()
    _id = self.item.copy()

    # Pass 1: locate the label and remember which page it is on.
    key_box = None
    key_page = None
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if span['text'] == '抵押人':
                        key_box = span['bbox']
                        key_page = pno
    if key_box is None:
        return name, _id

    rh = abs(key_box[1] - key_box[3])  # label height, used as row-height unit
    y_top, y_bottom = key_box[1], key_box[3] + rh * 3

    # Pass 2: only the label's page is searched; y coordinates of other
    # pages are unrelated to the label box.
    for block in self.pdf_info[key_page]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if not y_top < np.mean(bbox[1::2]) < y_bottom:
                    continue
                if '姓名' in text:
                    name['position'] = bbox
                    name['page'] = key_page
                    name['words'] = text.split(':')[-1]
                if '证件号码' in text:
                    _id['position'] = bbox
                    _id['page'] = key_page
                    _id['words'] = text.split(':')[-1]
    return name, _id
def get_dyrpo_name_id(self):
    """Extract the mortgagor's spouse name and ID number.

    The label is the last text span (over all pages) whose text is exactly
    '抵押人配偶(如适'.  Candidate spans are taken from the label's own page
    only and must have their vertical centre strictly between the label's
    top edge and three label-heights below its bottom edge.  A candidate
    containing '姓名' fills the name item, one containing '证件号码' fills
    the ID item; the stored words are the text after the last ':'.
    When several candidates qualify, the last one wins.

    Returns:
        tuple: ``(name, _id)``, two copies of ``self.item``, filled in
        where a matching span was found.
    """
    name = self.item.copy()
    _id = self.item.copy()

    # Pass 1: locate the label and remember which page it is on.
    key_box = None
    key_page = None
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if span['text'] == '抵押人配偶(如适':
                        key_box = span['bbox']
                        key_page = pno
    if key_box is None:
        return name, _id

    rh = abs(key_box[1] - key_box[3])  # label height, used as row-height unit
    y_top, y_bottom = key_box[1], key_box[3] + rh * 3

    # Pass 2: only the label's page is searched; y coordinates of other
    # pages are unrelated to the label box.
    for block in self.pdf_info[key_page]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if not y_top < np.mean(bbox[1::2]) < y_bottom:
                    continue
                if '姓名' in text:
                    name['position'] = bbox
                    name['page'] = key_page
                    name['words'] = text.split(':')[-1]
                if '证件号码' in text:
                    _id['position'] = bbox
                    _id['page'] = key_page
                    _id['words'] = text.split(':')[-1]
    return name, _id
def get_key_value_position(self, key):
    """Return the value printed to the right of ``key`` on the same row.

    The key is the last text span (over all pages) whose text equals
    ``key``.  A span on the key's own page is accepted as the value when
    its vertical centre is strictly inside the key's vertical extent, it
    starts to the right of the key's left edge, and the gap between the
    key's right edge and the span's left edge is below ten key-heights.
    When several spans qualify, the last one wins.

    Args:
        key: Exact text of the label span to look for.

    Returns:
        dict: A copy of ``self.item``; 'position', 'page' and 'words' are
        filled in when a value is found.
    """
    value = self.item.copy()

    # Pass 1: locate the key and remember which page it is on.
    key_box = None
    key_page = None
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if span['text'] == key:
                        key_box = span['bbox']
                        key_page = pno
    if key_box is None:
        return value

    rh = abs(key_box[1] - key_box[3])  # key height, used as distance unit

    # Pass 2: only the key's page is searched; coordinates of other pages
    # are unrelated to the key box.
    for block in self.pdf_info[key_page]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                same_row = key_box[1] < np.mean(bbox[1::2]) < key_box[3]
                if same_row and key_box[0] < bbox[0] and abs(key_box[2] - bbox[0]) < rh * 10:
                    value['position'] = bbox
                    value['page'] = key_page
                    value['words'] = text
    return value
def get_role_info_3_3(self, role_key, page_num='0'):
    """Read name, ID number and representative of one role (3_3 layout).

    The top-left corner of the last span on the page whose text starts
    with '保证人2' is used as an anchor that splits the page into four
    quadrants.  The span matching ``role_key`` (used as a regex at the
    start of the text) provides the name.  For the four known roles the
    '证件号码:' and '法定代表人或授权代表:' spans are accepted only when
    their centre lies strictly inside the role's quadrant.

    Args:
        role_key: Role label, e.g. '承租人一:'; also used as regex.
        page_num: Key of the page in ``self.pdf_info``.

    Returns:
        tuple: ``(name, id_num, representative)``, copies of ``self.item``
        filled in where found.  All three stay untouched when no anchor
        exists on the page.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    def text_spans():
        # Yield (bbox, text) of every span inside the page's type-0 blocks.
        for blk in self.pdf_info[page_num]['blocks']:
            if blk['type'] != 0:
                continue
            for ln in blk['lines']:
                for sp in ln['spans']:
                    yield sp['bbox'], sp['text']

    # Anchor: top-left corner of the (last) '保证人2' span.
    anchor = None
    for box, content in text_spans():
        if re.match('保证人2', content) is not None:
            anchor = [box[0], box[1]]
    if anchor is None:
        return name, id_num, representative

    # role -> (is left of anchor, is above anchor)
    quadrant_by_role = {
        '承租人一:': (True, True),
        '共同承租人:': (True, False),
        '保证人1:': (False, True),
        '保证人2:': (False, False),
    }
    quadrant = quadrant_by_role.get(role_key)

    def fill(target, box, content):
        target['words'] = content.split(':')[-1]
        target['page'] = page_num
        target['position'] = box

    for box, content in text_spans():
        # Role name.
        if re.match(role_key, content) is not None:
            fill(name, box, content)
        if quadrant is None:
            continue

        is_id = re.match('证件号码:', content) is not None
        is_representative = re.match('法定代表人或授权代表:', content) is not None
        if not (is_id or is_representative):
            continue

        want_left, want_above = quadrant
        centre_x = np.mean(box[::2])
        centre_y = np.mean(box[1::2])
        x_ok = centre_x < anchor[0] if want_left else centre_x > anchor[0]
        y_ok = centre_y < anchor[1] if want_above else centre_y > anchor[1]
        if not (x_ok and y_ok):
            continue

        # ID number located in the role's quadrant.
        if is_id:
            fill(id_num, box, content)
        # Legal/authorised representative located in the role's quadrant.
        if is_representative:
            fill(representative, box, content)
    return name, id_num, representative
def get_value_by_findall(self, prefix, suffix, page_num):
    """Extract the text between ``prefix`` and ``suffix`` on one page.

    All text spans of the page are concatenated and searched with the
    regex ``f"{prefix}(.*?){suffix}"``; ``prefix`` and ``suffix`` are
    inserted into the pattern verbatim (not escaped).  The first match is
    reported together with the bbox of the last span containing it.  If no
    single span contains the match, nothing is reported.

    Args:
        prefix: Pattern text that precedes the value.
        suffix: Pattern text that follows the value.
        page_num: Key of the page in ``self.pdf_info``.

    Returns:
        dict: A copy of ``self.item``; filled in when the value was found.
        The untouched copy is returned when the page does not exist, so
        callers with hard-coded page numbers do not fail on short
        documents.
    """
    value = self.item.copy()
    pno = page_num

    page = self.pdf_info.get(pno)
    if page is None:
        return value

    # Collect (bbox, text) once; it is needed for both the search text and
    # the later position lookup.
    spans = [
        (span['bbox'], span['text'])
        for block in page['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    all_text = ''.join(text for _, text in spans)

    words_list = re.findall(f"{prefix}(.*?){suffix}", all_text)
    if len(words_list) > 0:
        found = words_list[0]
        for bbox, text in spans:
            if found in text:
                value['position'] = bbox
                value['page'] = pno
                value['words'] = found
    return value
def get_info(self):
"""
Populate ``self.init_result`` with the fields read from ``self.pdf_info``.

Each extracted item is stored under a fixed field name in
``self.init_result``. The presence of '共同承租人:' in a text span of
page '0' selects the 'car/loan separated' variant of the role and
signature-page extraction. Only blocks with ``block['type'] == 0`` are
scanned for text spans in that check; other block types are skipped.

Returns:
dict: ``self.init_result``.
"""
if len(self.pdf_info) > 0:
# Contract number taken from page 1 (page key '0').
contract_no = self.get_contract_no(page_num='0')
self.init_result['合同编号'] = contract_no
# Rough check whether this is the 'car/loan separated' ('车贷分离') contract version.
is_cdfl = False
for block in self.pdf_info['0']['blocks']:
if block['type'] != 0:
continue
for line in block['lines']:
for span in line['spans']:
bbox, text = span['bbox'], span['text']
if '共同承租人:' in text:
is_cdfl = True
if is_cdfl == False:
# Read the names and ID numbers of the four roles from the first page.
name, id_num, representative = self.get_role_info(role_key='承租人:', page_num='0')
if name["words"] == None:
name, id_num, representative = self.get_role_info_3_3(role_key='承租人一:', page_num='0')
self.init_result['承租人-姓名'] = name
self.init_result['承租人-证件号码'] = id_num
self.init_result['承租人-法定代表人或授权代表'] = representative
name, id_num, representative = self.get_role_info(role_key='保证人1:', page_num='0')
self.init_result['保证人1-姓名'] = name
self.init_result['保证人1-证件号码'] = id_num
self.init_result['保证人1-法定代表人或授权代表'] = representative
# This `if` check corresponds to the 3_3 version.
if name["words"] == None:
name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人:', page_num='0')
self.init_result['共同承租人-姓名'] = name
self.init_result['共同承租人-证件号码'] = id_num
self.init_result['共同承租人-法定代表人或授权代表'] = representative
name, id_num, representative = self.get_role_info(role_key='保证人2:', page_num='0')
self.init_result['保证人2-姓名'] = name
self.init_result['保证人2-证件号码'] = id_num
self.init_result['保证人2-法定代表人或授权代表'] = representative
# This `if` check corresponds to the 3_3 version.
if name["words"] == None:
name, id_num, representative = self.get_role_info_3_3(role_key='保证人1:', page_num='0')
self.init_result['保证人2-姓名'] = name
self.init_result['保证人2-证件号码'] = id_num
self.init_result['保证人2-法定代表人或授权代表'] = representative
name, id_num, representative = self.get_role_info(role_key='保证人3:', page_num='0')
self.init_result['保证人3-姓名'] = name
self.init_result['保证人3-证件号码'] = id_num
self.init_result['保证人3-法定代表人或授权代表'] = representative
if name["words"] == None:
name, id_num, representative = self.get_role_info_3_3(role_key='保证人2:', page_num='0')
self.init_result['保证人3-姓名'] = name
self.init_result['保证人3-证件号码'] = id_num
self.init_result['保证人3-法定代表人或授权代表'] = representative
else:
name, id_num, representative = self.get_role_info_3_3(role_key='承租人一:', page_num='0')
self.init_result['承租人-姓名'] = name
self.init_result['承租人-证件号码'] = id_num
self.init_result['承租人-法定代表人或授权代表'] = representative
name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人:', page_num='0')
self.init_result['共同承租人-姓名'] = name
self.init_result['共同承租人-证件号码'] = id_num
self.init_result['共同承租人-法定代表人或授权代表'] = representative
name, id_num, representative = self.get_role_info_3_3(role_key='保证人1:', page_num='0')
self.init_result['保证人1-姓名'] = name
self.init_result['保证人1-证件号码'] = id_num
self.init_result['保证人1-法定代表人或授权代表'] = representative
name, id_num, representative = self.get_role_info_3_3(role_key='保证人2:', page_num='0')
self.init_result['保证人2-姓名'] = name
self.init_result['保证人2-证件号码'] = id_num
self.init_result['保证人2-法定代表人或授权代表'] = representative
# Search all pages for the contract number in the body text (Part 2: main finance-lease terms and payment plan); because the text may wrap across lines it is output without a position for now.
contract_no = self.get_contract_no_one()
self.init_result['合同编号(正文)'] = contract_no
# Vehicle identification number.
vin = self.get_key_value(key='车辆识别代码:')
self.init_result['车辆识别代码'] = vin
# Dealer (vehicle seller (dealer)); falls back to the shorter '车辆卖方:' label.
seller = self.get_key_value(key='车辆卖方(经销商):')
if seller['words'] == None:
seller = self.get_key_value(key='车辆卖方:')
self.init_result['车辆卖方(经销商)'] = seller
# Original vehicle sale price.
vehicle_price = self.get_key_value(key='车辆原始销售价格(《机动车销售统一发票》所列金额):')
self.init_result['车辆原始销售价格(《机动车销售统一发票》所列金额)'] = vehicle_price
# Vehicle add-on product detail table.
table_add_product = self.get_table_add_product()
self.init_result['车辆附加产品明细表'] = table_add_product
# Total financing cost.
financing_cost = self.get_key_value(key='融资成本总额:')
self.init_result['融资成本总额'] = financing_cost
# Lease term.
lease_term = self.get_key_value(key='租期:')
self.init_result['租期'] = lease_term
# Repayment schedule (table).
repayment_schedule = self.get_repayment_schedule()
self.init_result['付款计划表'] = repayment_schedule
# Lessee's receiving account: account name, account number, bank (page key '4').
name = self.get_key_value(key='户名:', page_num='4')
self.init_result['承租人收款账户-户名'] = name
account = self.get_key_value(key='银行账号:', page_num='4')
self.init_result['承租人收款账户-银行账号'] = account
bank = self.get_key_value(key='开户银行:', page_num='4')
self.init_result['承租人收款账户-开户行'] = bank
# Lessee's debit account: account name, account number, bank (page key '5').
name = self.get_key_value(key='户名:', page_num='5')
self.init_result['承租人扣款账户-户名'] = name
account = self.get_key_value(key='银行账号:', page_num='5')
self.init_result['承租人扣款账户-银行账号'] = account
bank = self.get_key_value(key='开户银行:', page_num='5')
self.init_result['承租人扣款账户-开户行'] = bank
# Information on the signature page.
# Lessee name and signature/seal.
if is_cdfl == False:
name = self.get_key_value(key='承租人姓名:')
electronic_signature = self.get_electronic_signature(top='承租人姓名:', bottom='保证人1姓名:')
if name["words"] == None:
name = self.get_key_value(key='承租人一姓名:')
electronic_signature = self.get_electronic_signature(top='承租人一姓名:', bottom='共同承租人名称:')
self.init_result['签字页-承租人姓名'] = name
self.init_result['签字页-承租人签章'] = electronic_signature
# Guarantor 1 name and signature/seal.
name = self.get_key_value(key='保证人1姓名:')
electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:')
self.init_result['签字页-保证人1姓名'] = name
self.init_result['签字页-保证人1签章'] = electronic_signature
# Note: this check compares name["words"] with "" (not None).
if name["words"] == "":
name = self.get_key_value(key='共同承租人名称:')
electronic_signature = self.get_electronic_signature(top='共同承租人名称:', bottom='保证人1姓名:')
self.init_result['签字页-共同承租人姓名'] = name
self.init_result['签字页-共同承租人签章'] = electronic_signature
# Guarantor 2 name and signature/seal.
name = self.get_key_value(key='保证人2姓名:')
electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:')
self.init_result['签字页-保证人2姓名'] = name
self.init_result['签字页-保证人2签章'] = electronic_signature
# This `if` check corresponds to the 3_3 version.
if name["words"] == "":
name = self.get_key_value(key='保证人1姓名:')
electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:')
self.init_result['签字页-保证人1姓名'] = name
self.init_result['签字页-保证人1签章'] = electronic_signature
# Guarantor 3 name and signature/seal.
name = self.get_key_value(key='保证人3姓名:')
electronic_signature = self.get_electronic_signature(top='保证人3姓名:', bottom='日期:')
self.init_result['签字页-保证人3姓名'] = name
self.init_result['签字页-保证人3签章'] = electronic_signature
# This `if` check corresponds to the 3_3 version.
if name["words"] == None:
name = self.get_key_value(key='保证人2姓名:')
electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='日期:')
self.init_result['签字页-保证人2姓名'] = name
self.init_result['签字页-保证人2签章'] = electronic_signature
else:
name = self.get_key_value(key='承租人一姓名:')
electronic_signature = self.get_electronic_signature(top='承租人一姓名:', bottom='共同承租人名称:')
self.init_result['签字页-承租人姓名'] = name
self.init_result['签字页-承租人签章'] = electronic_signature
name = self.get_key_value(key='共同承租人名称:')
electronic_signature = self.get_electronic_signature(top='共同承租人名称:', bottom='保证人1姓名:')
self.init_result['签字页-共同承租人姓名'] = name
self.init_result['签字页-共同承租人签章'] = electronic_signature
name = self.get_key_value(key='保证人1姓名:')
electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:')
self.init_result['签字页-保证人1姓名'] = name
self.init_result['签字页-保证人1签章'] = electronic_signature
name = self.get_key_value(key='保证人2姓名:')
electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:')
self.init_result['签字页-保证人2姓名'] = name
self.init_result['签字页-保证人2签章'] = electronic_signature
return self.init_result
def get_info_1(self):
    """Fill ``self.init_result_1`` from ``self.pdf_info`` and return it.

    Nothing is extracted when ``self.pdf_info`` is empty.  The dealer is
    looked up by key first; when that lookup yields an empty string the
    regex-based ``get_value_by_findall`` is used instead.

    Returns:
        dict: ``self.init_result_1``.
    """
    if len(self.pdf_info) == 0:
        return self.init_result_1

    # Contract number, lessee name and lessee ID number from page key '0'.
    self.init_result_1['合同编号'] = self.get_contract_no(page_num='0')
    self.init_result_1['承租人-姓名'] = self.get_key_value(key='承租人:', page_num='0')
    self.init_result_1['承租人-证件号码'] = self.get_key_value(key='证件号码:', page_num='0')

    # Selling dealer on page key '0'.
    dealer = self.get_key_value(key='销售经销商:', page_num='0')
    if dealer['words'] == "":
        dealer = self.get_value_by_findall('销售经销商:', '地址:', page_num='0')
    self.init_result_1['销售经销商'] = dealer

    # Contract number (body text).
    self.init_result_1['合同编号(正文)'] = self.get_contract_no_one()

    # Signature page: lessee name, lessee ID number, lessee signature/seal.
    self.init_result_1['签字页-承租人姓名'] = self.get_key_value(key='姓名/名称:')
    self.init_result_1['签字页-承租人证件号码'] = self.get_key_value(key='自然人身份证件号码/法人执照号码:')
    self.init_result_1['签字页-承租人签章'] = self.get_signature_role_1()

    # Signature page: selling dealer.  Example of the text handled by the
    # fallback: 销售经销商:深圳市宝创汽车贸易有限公司南山分公司(请授权代表签字并请盖章)
    signing_dealer = self.get_key_value(key='销售经销商:')
    if signing_dealer['words'] == "":
        signing_dealer = self.get_value_by_findall('销售经销商:', '(请授权代表签字并请盖章)', page_num='3')
    self.init_result_1['签字页-销售经销商'] = signing_dealer

    # Dealer signature/seal: not extracted.
    return self.init_result_1
def get_info_2(self):
    """Fill ``self.init_result_2`` from ``self.pdf_info`` and return it.

    Nothing is extracted when ``self.pdf_info`` is empty.

    Returns:
        dict: ``self.init_result_2``.
    """
    if len(self.pdf_info) == 0:
        return self.init_result_2

    # Contract number next to its label, and the one in the body text.
    self.init_result_2['合同编号'] = self.get_contract_no_dy()
    self.init_result_2['合同编号(正文)'] = self.get_contract_no_one()

    # Mortgagor name and ID number.
    (self.init_result_2['抵押人姓名/名称'],
     self.init_result_2['抵押人证件号码']) = self.get_dyr_name_id()

    # Mortgagor's spouse name and ID number.
    (self.init_result_2['抵押人配偶姓名/名称'],
     self.init_result_2['抵押人配偶证件号码']) = self.get_dyrpo_name_id()

    # Vehicle identification number, total rent, finance-lease term.
    self.init_result_2['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')
    self.init_result_2['租金总额'] = self.get_key_value_position(key='租金总额')
    self.init_result_2['融资租赁期限'] = self.get_key_value_position(key='融资租赁期限')

    # Signature page: mortgagor name and signature/seal.  Both lookups run
    # before either result is stored.
    (self.init_result_2['签字页-抵押人姓名'],
     self.init_result_2['签字页-抵押人签章']) = (
        self.get_key_value(key='抵押人姓名:'),
        self.get_electronic_signature(top='抵押权人盖章', bottom='抵押人配偶姓名:'),
    )

    # Signature page: spouse name and signature/seal.
    (self.init_result_2['签字页-抵押人配偶姓名'],
     self.init_result_2['签字页-抵押人配偶签章']) = (
        self.get_key_value(key='抵押人配偶姓名:'),
        self.get_electronic_signature(top='抵押人配偶姓名:', bottom='日期'),
    )
    return self.init_result_2
\ No newline at end of file
......@@ -6,9 +6,10 @@
# @Description :
from .get_char import Finder
from .get_char_fsm import Finder as FSMFinder
def predict(pdf_info, file_cls):
def predict(pdf_info, file_cls, is_fsm=False):
"""Summary
Args:
......@@ -58,7 +59,11 @@ def predict(pdf_info, file_cls):
pdf_info = dict()
for pno, page_info in enumerate(pdf_info_1):
pdf_info[str(pno)] = page_info
f = Finder(pdf_info)
if is_fsm:
f = FSMFinder(pdf_info)
else:
f = Finder(pdf_info)
if file_cls == 0:
results = f.get_info()
if file_cls == 1:
......
# Extraction rules for the WEP contract, keyed by page number (as str).
# 'keys':  field -> [(key text, regexes matched against span text,
#                     key search direction, extra kwargs)]
# 'value': field -> (source 'text' | 'img', value search direction,
#                    kwargs for the value search, default words)
# 'offset_tuple' is (x_min, x_max, y_min, y_max) in units of the key
# box's width (x) and height (y).
WEP_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [
                ('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', dict()),
            ],
            '证件类型': [
                ('证件类型', (r'^证件类型.?$',), 'top1', dict()),
            ],
            '证件号码': [
                ('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', dict()),
            ],
            '合同价格(小写)': [
                ('人民币', (r'^人民币¥.?$',), 'top1', dict()),
            ],
            '客户签名': [
                ('客户签名/盖章', (r'^客户签名/盖章.*$',), 'top1', dict()),
            ],
            '签单日期': [
                ('签单日期', (r'^签单日期.*签单日期.?$',), 'top1', dict()),
            ],
        },
        'value': {
            '客户姓名': ('text', 'right', dict(offset_tuple=(-1.1, 1, 0.3, 0)), ''),
            '证件类型': ('text', 'right', dict(offset_tuple=(-1, 1, 0, 0)), ''),
            '证件号码': ('text', 'right', dict(offset_tuple=(-1, 2, 0.3, 0)), ''),
            '合同价格(小写)': ('text', 'right', dict(offset_tuple=(-1, 1, 0.3, 0)), ''),
            '客户签名': ('img', 'under', dict(offset_tuple=(0, 0, 0, 4), rigorous=True), '无'),
            '签单日期': ('img', 'right', dict(offset_tuple=(0, 0, 1.1, 0), rigorous=True), '无'),
        },
    },
}
# Extraction rules for the MSI contract, keyed by page number (as str):
# customer data on page "0", signature and signing date on page "1".
# 'keys':  field -> [(key text, regexes matched against span text,
#                     key search direction, extra kwargs)]
# 'value': field -> (source 'text' | 'img', value search direction,
#                    kwargs for the value search, default words)
MSI_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [
                ('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', dict()),
            ],
            '证件类型': [
                ('证件类型', (r'^证件类型.?$',), 'top1', dict()),
            ],
            '证件号码': [
                ('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', dict()),
            ],
            '合同价格(小写)': [
                ('人民币', (r'^人民币¥.?$',), 'top1', dict()),
            ],
        },
        'value': {
            '客户姓名': ('text', 'right', dict(offset_tuple=(-1.2, 1, 0.3, 0)), ''),
            '证件类型': ('text', 'right', dict(offset_tuple=(-1, 1, 0, 0)), ''),
            '证件号码': ('text', 'right', dict(offset_tuple=(-1, 2, 0.3, 0)), ''),
            '合同价格(小写)': ('text', 'right', dict(offset_tuple=(-1, 1, 0.3, 0)), ''),
        },
    },
    "1": {
        'keys': {
            '客户签名': [
                ('客户签名/盖章', (r'^客户签名/盖章.*$',), 'top1', dict()),
            ],
            '签单日期': [
                ('签单日期', (r'^签单日期.*签单日期.?$',), 'top1', dict()),
            ],
        },
        'value': {
            '客户签名': ('img', 'under', dict(offset_tuple=(0, 0, 0, 4), rigorous=True), '无'),
            '签单日期': ('img', 'right', dict(offset_tuple=(0, 0, 1.1, 0), rigorous=True), '无'),
        },
    },
}
# Extraction rules for the SC contract, keyed by page number (as str).
# Page "0" carries the customer data; page "-1" is a placeholder that the
# retriever resolves to the highest page number present in the text list.
# 'keys':  field -> [(key text, regexes matched against span text,
#                     key search direction, extra kwargs)]
# 'value': field -> (source 'text' | 'img', value search direction,
#                    kwargs for the value search, default words)
SC_FIELD = {
    "0": {
        'keys': {
            '姓名': [('姓名', (r'^姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '总价': [('总价', (r'^总价.?$', ), 'top1', {})],
        },
        'value': {
            '姓名': ('text', 'right', {'offset_tuple': (-2, 8, 0.5, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '总价': ('text', 'right', {'offset_tuple': (-2, 12, 0.5, 0)}, ''),
        },
    },
    "-1": {
        'keys': {
            # The pattern used to be listed twice, which recorded every
            # matching span twice; one entry is sufficient.
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户签名': ('img', 'under', {'offset_tuple': (1.5, 1, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    },
}
from .retriever import Retriever
from .const import WEP_FIELD, MSI_FIELD, SC_FIELD
from .tools import pdf_info_rebuild
# One retriever per supported contract type; the list index is the
# `file_type` accepted by predict(): 0 = WEP, 1 = MSI, 2 = SC.
retriever_list = [
    Retriever(field_rules)
    for field_rules in (WEP_FIELD, MSI_FIELD, SC_FIELD)
]


def predict(pdf_info, file_type=0):
    """Extract the configured fields of one contract.

    Args:
        pdf_info: Page data passed on to ``pdf_info_rebuild``.
        file_type: Index into ``retriever_list`` selecting the rule set.

    Returns:
        Whatever the selected retriever's ``get_target_fields`` returns.
    """
    selected = retriever_list[file_type]
    text_by_page, img_by_page = pdf_info_rebuild(pdf_info)
    return selected.get_target_fields(text_by_page, img_by_page)
from .retriever import HMHRetriever
from .tools import pdf_info_rebuild
# Shared retriever instance used by predict().
hmh_retriever = HMHRetriever()


def predict(pdf_info):
    """Extract the HMH fields from ``pdf_info``.

    The page data is rebuilt without bbox correction (``fix_bbox=False``)
    and only the text part of the rebuilt data is used.

    Returns:
        Whatever ``hmh_retriever.get_target_fields`` returns.
    """
    text_by_page, _unused_images = pdf_info_rebuild(pdf_info, fix_bbox=False)
    return hmh_retriever.get_target_fields(text_by_page)
import re
class HMHRetriever:
    """Regex-based field extractor working on the text spans of page '0'."""

    def __init__(self):
        # Keys used in every per-field result dict.
        self.words_str = 'words'
        self.position_str = 'location'
        # Location reported for fields that were not found.
        self.default_position = [0, 0, 0, 0]
        # (field name, default words) in output order.
        self.search_fields_list = [
            ('借款/承租人姓名', ''),
            ('证件号码', ''),
            ('渠道', ''),
            ('合同编号', ''),
            ('借款人签字/盖章', '无'),
        ]

    def get_target_fields(self, pdf_text_list):
        """Extract all fields of ``search_fields_list`` from page '0'.

        Each field group is taken from the first span that yields it; later
        spans are not considered for that group any more.

        Args:
            pdf_text_list: Mapping of page key to a list of ``(bbox, text)``
                pairs.  Only key '0' is read; the mapping is not modified.

        Returns:
            dict: ``{"words_result": {field: {'words': ..., 'location': ...}}}``
            containing every field of ``search_fields_list``; fields that
            were not found carry their default words and a fresh copy of
            ``default_position``.
        """
        result = dict()
        is_find_name_id_company, is_find_application_no, is_find_name_date = False, False, False
        # .get() rather than .pop(): reading must not remove page '0' from
        # the caller's mapping.
        for bbox, text in pdf_text_list.get(str(0), []):
            if not is_find_name_id_company:
                # One span carries name, ID number and company together.
                name_id_company_list = re.findall(r'姓名(.*)证件号码(.*)与(.*公司)', text)
                for name_id_company_tuple in name_id_company_list:
                    if len(name_id_company_tuple) == 3:
                        result[self.search_fields_list[0][0]] = {
                            self.words_str: name_id_company_tuple[0].replace('\u3000', '').strip(),
                            self.position_str: bbox
                        }
                        result[self.search_fields_list[1][0]] = {
                            self.words_str: name_id_company_tuple[1].replace('\u3000', '').replace(')', '').replace(')', '').strip(),
                            self.position_str: bbox
                        }
                        result[self.search_fields_list[2][0]] = {
                            self.words_str: name_id_company_tuple[2],
                            self.position_str: bbox
                        }
                        is_find_name_id_company = True
                        break
            if not is_find_application_no:
                application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', text)
                if len(application_no_list) == 1:
                    result[self.search_fields_list[3][0]] = {
                        self.words_str: application_no_list[0],
                        self.position_str: bbox
                    }
                    is_find_application_no = True
            if not is_find_name_date:
                # Signer name followed by the signing date (YYYY-MM-DD).
                name_date_list = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', text)
                for name_date_tuple in name_date_list:
                    if len(name_date_tuple) == 2:
                        result[self.search_fields_list[4][0]] = {
                            self.words_str: '{0} {1}'.format(name_date_tuple[0].replace('\u3000', '').strip(), name_date_tuple[1]),
                            self.position_str: bbox
                        }
                        is_find_name_date = True
                        break

        for find_key, default_value in self.search_fields_list:
            if find_key not in result:
                result[find_key] = {
                    self.words_str: default_value,
                    # Copy, so result entries never alias each other or
                    # the instance-level default.
                    self.position_str: list(self.default_position),
                }
        return {"words_result": result}
class Retriever:
    """Rule-driven field extractor for fixed-layout pages.

    ``target_fields`` maps a page key (str; '-1' stands for the highest
    page number present in the text list) to a dict with two entries:

    * ``'keys'``:  field -> list of ``(key_text, regex_tuple, direction,
      kwargs)``; ``direction`` selects the ``key_<direction>`` method.
    * ``'value'``: field -> ``(source, direction, kwargs, default_words)``;
      ``source`` is 'text' or anything else for the image list, and
      ``direction`` selects the ``value_<direction>`` method.
    """

    def __init__(self, target_fields):
        self.keys_str = 'keys'
        self.value_str = 'value'
        self.text_str = 'text'
        self.words_str = 'words'
        self.position_str = 'position'
        # Position reported for fields that were not found.
        self.default_position = [-1, -1, -1, -1]
        self.target_fields = target_fields
        # value_type -> single-character replacements applied to the value.
        self.replace_map = {
            'int': {
                '(': '0'
            }
        }

    @staticmethod
    def _in_target_bbox(box, target_bbox, rigorous):
        """Tell whether ``box`` lies in ``target_bbox`` (strict bounds).

        With ``rigorous`` the whole box must be inside, otherwise only its
        centre point.
        """
        x0, y0, x1, y1 = box
        x_min, y_min, x_max, y_max = target_bbox
        if rigorous:
            return x_min < x0 and x1 < x_max and y_min < y0 and y1 < y_max
        cent_x = x0 + ((x1 - x0) / 2)
        cent_y = y0 + ((y1 - y0) / 2)
        return x_min < cent_x < x_max and y_min < cent_y < y_max

    def _apply_replace_map(self, value, value_type):
        """Apply the character replacements registered for ``value_type``."""
        if isinstance(value_type, str) and value_type in self.replace_map and isinstance(value, str):
            return value.translate(str.maketrans(self.replace_map.get(value_type, {})))
        return value

    def _default_result(self, default_value):
        """Build the result entry of a field that was not found."""
        return {
            self.words_str: default_value,
            # Copy, so entries never alias each other or the instance default.
            self.position_str: list(self.default_position),
        }

    @staticmethod
    def key_top1(coordinates_list, key_coordinates):
        """Key search direction 'top1': the box with the smallest y0.

        ``coordinates_list`` is left untouched; ``key_coordinates`` is
        accepted for signature compatibility and not used.
        """
        return sorted(coordinates_list, key=lambda x: x[1])[0]

    def key_right(self, coordinates_list, key_coordinates, offset_tuple, rigorous=False):
        """Key search direction 'right': leftmost box inside the target area.

        The target area is derived from ``key_coordinates`` (the previous
        key in the chain) and ``offset_tuple``.  Falls back to ``key_top1``
        when there is no previous key or no box qualifies.
        """
        if len(coordinates_list) == 1:
            return coordinates_list[0]
        # Without coordinates of the previous key, use the topmost box.
        if key_coordinates is None:
            return self.key_top1(coordinates_list, key_coordinates)

        target_bbox = self.get_target_bbox(key_coordinates, offset_tuple)
        x_min_find, find_key_coordinates = None, None
        for x0, y0, x1, y1 in coordinates_list:
            if self._in_target_bbox((x0, y0, x1, y1), target_bbox, rigorous):
                if x_min_find is None or x0 < x_min_find:
                    x_min_find = x0
                    find_key_coordinates = (x0, y0, x1, y1)
        if find_key_coordinates is None:
            return self.key_top1(coordinates_list, key_coordinates)
        return find_key_coordinates

    def value_right(self, search_list, key_coordinates, offset_tuple, value_type=None, rigorous=False):
        """Value search direction 'right'.

        Returns ``(value, coordinates)`` of the leftmost non-blank entry of
        ``search_list`` inside the target area, or ``(None, None)``.
        """
        target_bbox = self.get_target_bbox(key_coordinates, offset_tuple)
        x_min_find, value, coordinates = None, None, None
        for (x0, y0, x1, y1), text in search_list:
            if not self._in_target_bbox((x0, y0, x1, y1), target_bbox, rigorous):
                continue
            if x_min_find is None or x0 < x_min_find:
                if len(text.strip()) > 0:
                    x_min_find = x0
                    value = text
                    coordinates = (x0, y0, x1, y1)
        return self._apply_replace_map(value, value_type), coordinates

    def value_under(self, search_list, key_coordinates, offset_tuple, value_type=None, append=False, rigorous=False):
        """Value search direction 'under'.

        Non-blank entries of ``search_list`` inside the target area are
        ordered by (y0, x0).  The first one provides the coordinates; the
        value is the first text, or all texts joined when ``append`` is
        set.  Returns ``(None, None)`` when nothing qualifies.
        """
        target_bbox = self.get_target_bbox(key_coordinates, offset_tuple)
        find_list = []
        for (x0, y0, x1, y1), text in search_list:
            if self._in_target_bbox((x0, y0, x1, y1), target_bbox, rigorous):
                if len(text.strip()) > 0:
                    find_list.append((x0, y0, x1, y1, text))
        if len(find_list) == 0:
            return None, None

        find_list.sort(key=lambda x: (x[1], x[0]))
        coordinates = find_list[0][:-1]
        if append:
            value = ''.join([text for _, _, _, _, text in find_list])
        else:
            value = find_list[0][-1]
        return self._apply_replace_map(value, value_type), coordinates

    @staticmethod
    def get_target_bbox(key_coordinates, offset_tuple):
        """Grow the key box into the search area.

        ``offset_tuple`` is ``(x_min, x_max, y_min, y_max)`` in units of
        the key box's width (x) and height (y).  Positive offsets move an
        edge outwards; a negative minimum offset therefore moves the
        left/top edge inwards or past the key box.
        """
        offset_xmin, offset_xmax, offset_ymin, offset_ymax = offset_tuple
        width = key_coordinates[2] - key_coordinates[0]
        height = key_coordinates[-1] - key_coordinates[1]
        x_min = key_coordinates[0] - (width * offset_xmin)
        x_max = key_coordinates[2] + (width * offset_xmax)
        y_min = key_coordinates[1] - (height * offset_ymin)
        y_max = key_coordinates[-1] + (height * offset_ymax)
        return x_min, y_min, x_max, y_max

    def get_target_fields(self, pdf_text_list, pdf_img_list):
        """Extract every configured field.

        Args:
            pdf_text_list: page key -> list of ``(bbox, text)`` for text.
            pdf_img_list: page key -> list of ``(bbox, text)`` for images.

        Returns:
            dict: ``'page_<n>'`` (1-based) -> field ->
            ``{'words': ..., 'position': ...}``.  Sections of
            ``target_fields`` that resolve to the same page are merged into
            one entry.  Fields that cannot be found carry their default
            words and a copy of ``default_position``.
        """
        pdf_result = dict()
        for pno_str, fields_dict in self.target_fields.items():
            if pno_str == '-1':
                # '-1' means the highest page number that has text; page 0
                # is used when there is no text page at all, so the fields
                # still show up with their defaults instead of max() failing.
                pno_str = str(max((int(page_key) for page_key in pdf_text_list), default=0))
            page_text_list = pdf_text_list.get(pno_str, [])

            # Step 1: collect the boxes of all spans matching each key text.
            key_text_info = dict()
            for key_text_list in fields_dict[self.keys_str].values():
                for key_text, key_re_tuple, _, _ in key_text_list:
                    for (x0, y0, x1, y1), text in page_text_list:
                        for key_re in key_re_tuple:
                            if re.match(key_re, text):
                                key_text_info.setdefault(key_text, list()).append((x0, y0, x1, y1))
                                # One hit per span is enough, even when
                                # several patterns match it.
                                break

            # Step 2: resolve each field's key chain to one key box.
            key_coordinates_info = dict()
            for field, key_text_list in fields_dict[self.keys_str].items():
                last_key_coordinates = None
                for key_text, _, direction, kwargs in key_text_list:
                    if key_text not in key_text_info:
                        last_key_coordinates = None
                        continue
                    last_key_coordinates = getattr(self, 'key_{0}'.format(direction))(
                        key_text_info[key_text],
                        last_key_coordinates,
                        **kwargs)
                key_coordinates_info[field] = last_key_coordinates

            # Step 3: look up each field's value relative to its key box.
            page_result = dict()
            for field, (source, direction, kwargs, default_value) in fields_dict[self.value_str].items():
                if not isinstance(key_coordinates_info.get(field), tuple):
                    page_result[field] = self._default_result(default_value)
                    continue
                value, coordinates = getattr(self, 'value_{0}'.format(direction))(
                    page_text_list if source == self.text_str else pdf_img_list.get(pno_str, []),
                    key_coordinates_info[field],
                    **kwargs
                )
                if not isinstance(value, str):
                    page_result[field] = self._default_result(default_value)
                else:
                    page_result[field] = {
                        self.words_str: value,
                        self.position_str: list(coordinates),
                    }

            # Merge rather than overwrite: two sections may resolve to the
            # same page (e.g. '0' and '-1' for a one-page document).
            pdf_result.setdefault('page_{0}'.format(int(pno_str) + 1), dict()).update(page_result)
        return pdf_result
def pdf_info_rebuild(pdf_info, fix_bbox=True):
    """Flatten per-page block data into text and image lookup lists.

    Args:
        pdf_info: Mapping of page key to page data with a 'blocks' list.
            Blocks of type 0 carry 'lines' -> 'spans' (each span has
            'bbox', 'text' and 'size'); blocks of type 1 carry a 'bbox'.
            The mapping and everything in it is left unmodified.
        fix_bbox: When true, a span whose bbox height is smaller than its
            'size' gets its bottom edge moved down by 'size', because the
            reported height is not reliable.

    Returns:
        tuple: ``(pdf_text_info, pdf_img_info)``.  ``pdf_text_info`` maps
        the page key to ``[bbox, text]`` entries (stripped, non-empty text;
        repeated text within one block is kept once; ``bbox`` is a new
        list).  ``pdf_img_info`` maps the page key to ``(bbox, '有')``
        entries, one per type-1 block.  Pages without entries are absent
        from the respective mapping.
    """
    pdf_text_info = dict()
    pdf_img_info = dict()
    for pno_str, page_info in pdf_info.items():
        for block in page_info['blocks']:
            if block['type'] == 0:
                # The same text can occur repeatedly inside one block.
                seen_text = set()
                for line in block['lines']:
                    for span in line['spans']:
                        text = span['text'].strip()
                        if len(text) == 0 or text in seen_text:
                            continue
                        seen_text.add(text)
                        # Work on a copy: the caller's span must not change,
                        # and the bbox may be an immutable tuple.
                        bbox = list(span['bbox'])
                        if fix_bbox and bbox[-1] - bbox[1] < span['size']:
                            bbox[-1] = bbox[-1] + span['size']
                        pdf_text_info.setdefault(pno_str, list()).append([bbox, text])
            elif block['type'] == 1:
                pdf_img_info.setdefault(pno_str, list()).append((block['bbox'], '有'))
    return pdf_text_info, pdf_img_info
\ No newline at end of file
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!