fd556337 by 周伟奇

add contract 8.5

1 parent 2dc31fab
......@@ -2970,6 +2970,8 @@ def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True
auto_obj = auto_class.objects.filter(application_id=application_id, on_off=True).first()
if auto_obj is not None:
auto_result = se_compare_auto(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, auto_obj)
else:
auto_result = None
full_result = se_compare(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, is_cms, auto_result)
......
......@@ -6,28 +6,35 @@
# @Description :
from .get_char import Finder
import numpy as np
def predict(pdf_info):
    """Collect the text spans of every page and run ``Finder`` over them.

    Args:
        pdf_info: Mapping of page key -> page dict. Each page dict has a
            ``'blocks'`` list; blocks whose ``'type'`` is 0 are read as
            ``'lines'`` -> ``'spans'``, and each span provides ``'bbox'``
            (xmin, ymin, xmax, ymax) and ``'text'``.

    Returns:
        The value returned by ``Finder(pdf_info, ocr_results=...).get_info()``.
    """
    ocr_results = {}
    for pno in pdf_info:
        page_items = []
        for block in pdf_info[pno]['blocks']:
            # Only blocks with type 0 are processed; everything else is skipped.
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if len(text) == 0:
                        continue
                    xmin, ymin, xmax, ymax = bbox
                    # Expand the rectangle into a 4-point polygon
                    # (x0, y0, x1, y0, x1, y1, x0, y1) with integer coordinates.
                    polygon = [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]
                    polygon = np.array(polygon, dtype=np.int32).tolist()
                    # NOTE(review): as rendered here both arguments of the first
                    # replace() are the same character, which makes it a no-op;
                    # it presumably maps a full-width colon to ':' - confirm
                    # against the real file.
                    text = text.replace(":", ":").replace(" ", "")
                    page_items.append([polygon, text])
        # Order the spans top-to-bottom by the polygon's first y coordinate and
        # key them 0..n-1 in that order. The per-span write into
        # ocr_results[pno][key] was dropped: it was overwritten by this
        # assignment anyway.
        page_items.sort(key=lambda item: item[0][1])
        ocr_results[pno] = dict(enumerate(page_items))
    # Finder receives the information for the whole PDF.
    finder = Finder(pdf_info, ocr_results=ocr_results)
    return finder.get_info()
......
......@@ -8,6 +8,7 @@
import re
import numpy as np
from fuzzywuzzy import fuzz
from shapely.geometry import Polygon
class Finder:
......@@ -111,6 +112,28 @@ class Finder:
},
}
def get_top_iou(self, poly, ocr_result):
    """Find the OCR entry whose polygon overlaps ``poly`` the most.

    Args:
        poly: Polygon coordinates; anything that
            ``np.array(poly).reshape((-1, 2))`` turns into (x, y) vertices.
        ocr_result: Mapping of key -> ``[bbox, text]`` where ``bbox`` is a
            polygon in the same coordinate form as ``poly``.

    Returns:
        ``[iou, key]`` for the entry with the highest intersection-over-union
        (the last such entry in iteration order wins on ties), or
        ``(-1, -1)`` when no valid candidate could be scored.
    """
    if not ocr_result:
        return -1, -1

    # The query polygon does not change per candidate, so build it once.
    target = Polygon(np.array(poly).reshape((-1, 2)))
    if not target.is_valid:
        # Every candidate would have been skipped, so nothing can match.
        return -1, -1

    best = None
    for key in ocr_result:
        bbox, text = ocr_result[key]
        candidate = Polygon(np.array(bbox).reshape((-1, 2)))
        if not candidate.is_valid:
            continue
        inter = candidate.intersection(target).area
        union = candidate.area + target.area - inter
        if union <= 0:
            # Degenerate pair with no area: skip instead of dividing by zero.
            continue
        iou = inter / union
        # ">=" keeps the later entry on equal scores, matching a stable
        # ascending sort followed by taking the last element.
        if best is None or iou >= best[0]:
            best = [iou, key]

    if best is None:
        return -1, -1
    return best
def poly_to_rectangle(self, poly):
xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax = poly
bbox = [xmin, ymin, xmax, ymax]
......@@ -253,38 +276,67 @@ class Finder:
if mergeindex == -1:
return text_list
else:
new_text_list = text_list[:mergeindex] + [text_list[mergeindex] + text_list[mergeindex + 1]] + \
text_list[mergeindex + 2:]
new_text_list = text_list[:mergeindex] + [text_list[mergeindex] + text_list[mergeindex + 1]] + text_list[
mergeindex + 2:]
return self.mergelist(new_text_list)
# Build the detail table for one page and return it inside a copy of self.item
# (the table is stored under the 'words' key).
# NOTE(review): this span comes from a rendered diff with indentation lost; it appears
# to contain both a text-list based implementation and a bbox/IoU based one - confirm
# against the real file which statements are current and how they nest.
def get_asp_details(self, page_num):
asp_details_table_term = self.item.copy()
asp_details_table = []
asp_details_text_list = []
table = False
# Scan the type-0 blocks of the page; collect span texts from the table title
# until a span containing '第二条' or '征信管理' is reached.
for block in self.pdf_info[page_num]['blocks']:
if block['type'] != 0:
continue
for line in block['lines']:
for span in line['spans']:
bbox, text = span['bbox'], span['text']
if '附加产品融资贷款本金总金额明细' == text:
table = True
if '第二条' in text or '征信管理' in text:
table = False
if table == True:
# print(text)
asp_details_text_list.append(text)
# Let self.mergelist join fragments of the collected list.
asp_details_text_list = self.mergelist(asp_details_text_list)
# Row 0 holds the single leading entry; each later row takes three consecutive entries.
for i in range((len(asp_details_text_list) + 2) // 3):
line = []
if i == 0:
line = [asp_details_text_list[0]]
else:
for j in range(3):
line.append(asp_details_text_list[i * 3 - 2 + j])
asp_details_table.append(line)
if len(asp_details_table) > 0:
# Fixed title row and column-header row for the bbox-based table.
asp_details_table = [['附加产品融资贷款本金总金额及贷款利率明细'], ['项目1', '用途总金额2', '贷款本金3']]
bbox_xm = None
bbox_ytzje = None
bbox_dkbj = None
bbox_total = None
# Locate the three column headers and the total label on this page by exact text match.
for key in self.ocr_results[page_num]:
bbox, text = self.ocr_results[page_num][key]
if text == '项目1':
bbox_xm = bbox
if text == '用途总金额2':
bbox_ytzje = bbox
if text == '贷款本金3':
bbox_dkbj = bbox
if text == '附加产品融资贷款本':
bbox_total = bbox
# print(bbox_xm, bbox_ytzje, bbox_dkbj, bbox_total)
if bbox_xm:
# Walk down at most 10 steps: shift the current first-column box down by 1.4x its
# height and take the OCR entry with the highest IoU against the shifted box.
for i in range(10):
rh = abs(bbox_xm[1] - bbox_xm[-1])
anchor = np.array(bbox_xm).reshape((-1, 2))
anchor[:, 1] += int(rh * 1.4)
_iou, _key = self.get_top_iou(poly=anchor, ocr_result=self.ocr_results[page_num])
if _iou > 0:
bbox, xm_text = self.ocr_results[page_num][_key]
bbox_xm = bbox
# Handle an item name that wraps onto a second line: text without '所购' is
# appended to the first cell of the previous row.
if not '所购' in xm_text:
line = asp_details_table[-1]
line[0] += xm_text
asp_details_table[-1] = line
continue
# print(xm_text)
# Anchor built from the x-coordinates of the '用途总金额2' header and the
# y-coordinates of the current row.
# NOTE(review): bbox_ytzje / bbox_dkbj are used without a None check - confirm
# the headers are always found when '项目1' is.
anchor_1 = [bbox_ytzje[0], bbox[1], bbox_ytzje[2], bbox[3],
bbox_ytzje[4], bbox[5], bbox_ytzje[6], bbox[7]]
_iou, _key = self.get_top_iou(poly=anchor_1, ocr_result=self.ocr_results[page_num])
# NOTE(review): _key is used unchecked; get_top_iou returns -1 when nothing valid
# was scored - confirm -1 cannot reach this lookup.
bbox, ytzje_text = self.ocr_results[page_num][_key]
# print(ytzje_text)
# bbox was just rebound to the matched box, so its y-coordinates are the ones used here.
anchor_2 = [bbox_dkbj[0], bbox[1], bbox_dkbj[2], bbox[3],
bbox_dkbj[4], bbox[5], bbox_dkbj[6], bbox[7]]
_iou, _key = self.get_top_iou(poly=anchor_2, ocr_result=self.ocr_results[page_num])
bbox, dkbj_text = self.ocr_results[page_num][_key]
# print(dkbj_text)
# When both lookups returned the same text, split it on a space into two cells.
# NOTE(review): predict() removes spaces from the OCR text; if self.ocr_results is
# that dict, this split cannot yield two parts - confirm.
if xm_text == ytzje_text:
xm_text, ytzje_text = xm_text.split(' ')
line = [xm_text, ytzje_text, dkbj_text]
asp_details_table.append(line)
else:
break
if bbox_total:
# NOTE(review): `anchor` is built from bbox_total here, but the call below passes
# anchor_2 - looks like a copy-paste slip; confirm the intended argument.
anchor = [bbox_total[0], bbox[1], bbox_total[2], bbox[3],
bbox_total[4], bbox[5], bbox_total[6], bbox[7]]
_iou, _key = self.get_top_iou(poly=anchor_2, ocr_result=self.ocr_results[page_num])
bbox, total_text = self.ocr_results[page_num][_key]
asp_details_table.append(['附加产品融资贷款本金总金额:', '', total_text])
asp_details_table_term['words'] = asp_details_table
return asp_details_table_term
......@@ -678,163 +730,178 @@ class Finder:
if '附加产品融资贷款本金总金额' in text:
self.is_asp = True
self.gen_init_result(self.is_asp)
# Page 1
# 找合同编号
contract_no = self.get_contract_no(page_num='0')
self.init_result['page_1']['合同编号'] = contract_no
# 所购车辆价格
vehicle_price = self.get_vehicle_price()
self.init_result['page_1']['所购车辆价格'] = vehicle_price
# 车架号
vin = self.get_vin()
self.init_result['page_1']['车架号'] = vin
# 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目
upper, lower, asp_1, asp_2 = self.get_loan_principal()
self.init_result['page_1']['贷款本金金额']['大写'] = upper
self.init_result['page_1']['贷款本金金额']['小写'] = lower
self.init_result['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1
self.init_result['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
# 贷款期限
loan_term = self.get_loan_term()
self.init_result['page_1']['贷款期限'] = loan_term
# 附加产品融资贷款本金总金额明细(ASP-表格)
asp_details_table = self.get_asp_details(page_num='0')
self.init_result['page_1']['附加产品融资贷款本金总金额明细'] = asp_details_table
# 借款人签字及时间
signature = self.get_signature()
self.init_result['page_1']['借款人签字及时间'] = signature
#######################################
# Page 2
# 找合同编号
contract_no = self.get_contract_no(page_num='0')
self.init_result['page_2']['合同编号'] = contract_no
# 找借款人及抵押人(地址字段原本有空格)
borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人:')
# 这是为了同时兼容 8.1 版本
if borrower_name['words'] == None:
borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:')
self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name
self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id
# 找共同借款人及共同抵押人
co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人:', bottom='保证人1:')
self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name
self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id
# 保证人1
first_guarantor_name, first_guarantor_id = self.get_somebody(top='保证人1:', bottom='保证人2:')
self.init_result['page_2']['保证人1']['name'] = first_guarantor_name
self.init_result['page_2']['保证人1']['id'] = first_guarantor_id
# 保证人2
second_guarantor_name, second_guarantor_id = self.get_somebody(top='保证人2:', bottom='第一章')
self.init_result['page_2']['保证人2']['name'] = second_guarantor_name
self.init_result['page_2']['保证人2']['id'] = second_guarantor_id
# 所购车辆价格
vehicle_price = self.get_vehicle_price(page_num='1')
self.init_result['page_2']['所购车辆价格'] = vehicle_price
# 车架号
vin = self.get_vin(page_num='1')
self.init_result['page_2']['车架号'] = vin
# 经销商
seller = self.get_seller()
self.init_result['page_2']['经销商'] = seller
# 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目
upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1')
self.init_result['page_2']['贷款本金金额']['大写'] = upper
self.init_result['page_2']['贷款本金金额']['小写'] = lower
self.init_result['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1
self.init_result['page_2']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
# 贷款期限
loan_term = self.get_loan_term(page_num='1')
self.init_result['page_2']['贷款期限'] = loan_term
# 本合同当期的标准利率
standard_rate = self.get_standard_rate(page_num='1')
self.init_result['page_2']['标准利率'] = standard_rate
# 还款账户
account, account_name, account_bank = self.get_payback_account()
self.init_result['page_2']['还款账户']['账号'] = account
self.init_result['page_2']['还款账户']['户名'] = account_name
self.init_result['page_2']['还款账户']['开户行'] = account_bank
#######################################
# Page 3
# 找合同编号
contract_no = self.get_contract_no(page_num='2')
self.init_result['page_3']['合同编号'] = contract_no
# 还款计划表(表格)
repayment_schedule_table = self.get_repayment_schedule()
self.init_result['page_3']['还款计划表'] = repayment_schedule_table
#######################################
# Page 4
# 找合同编号
contract_no = self.get_contract_no(page_num='3')
self.init_result['page_4']['合同编号'] = contract_no
# 附加产品融资贷款本金总金额明细(ASP-表格)
asp_details_table = self.get_asp_details(page_num='3')
self.init_result['page_4']['附加产品融资贷款本金总金额明细'] = asp_details_table
#######################################
# Page 5
# 找合同编号
contract_no = self.get_contract_no(page_num='4')
self.init_result['page_5']['合同编号'] = contract_no
#######################################
# Page 6
# 找合同编号
contract_no = self.get_contract_no(page_num='5')
self.init_result['page_6']['合同编号'] = contract_no
if self.is_asp == False:
# Page 7
if len(list(self.ocr_results.keys())) <= 8: # 8.5 版本客户提供的样本出现串页的情况,暂时无法识别
# Page 1
# 找合同编号
contract_no = self.get_contract_no(page_num='6')
self.init_result['page_7']['合同编号'] = contract_no
signature_name, signature_date = self.get_last_page_signature(page_num='6',
top='合同编号', bottom='共同借款人')
self.init_result['page_7']['主借人签字']['签字'] = signature_name
self.init_result['page_7']['主借人签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='6',
top='共同借款人', bottom='保证人1')
self.init_result['page_7']['共借人签字']['签字'] = signature_name
self.init_result['page_7']['共借人签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='6',
top='保证人1', bottom='保证人2')
self.init_result['page_7']['保证人1签字']['签字'] = signature_name
self.init_result['page_7']['保证人1签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='6',
top='保证人2', bottom='在本人面前亲笔签署本合同')
self.init_result['page_7']['保证人2签字']['签字'] = signature_name
self.init_result['page_7']['保证人2签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='6',
top='在本人面前亲笔签署本合同', bottom='以下无正文')
self.init_result['page_7']['见证人签字']['签字'] = signature_name
self.init_result['page_7']['见证人签字']['日期'] = signature_date
else:
# Page 7
contract_no = self.get_contract_no(page_num='0')
# print(contract_no)
self.init_result['page_1']['合同编号'] = contract_no
# 所购车辆价格
vehicle_price = self.get_vehicle_price()
# print(vehicle_price)
self.init_result['page_1']['所购车辆价格'] = vehicle_price
# 车架号
vin = self.get_vin()
# print(vin)
self.init_result['page_1']['车架号'] = vin
# 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目
upper, lower, asp_1, asp_2 = self.get_loan_principal()
# print(upper, lower, asp_1, asp_2)
self.init_result['page_1']['贷款本金金额']['大写'] = upper
self.init_result['page_1']['贷款本金金额']['小写'] = lower
self.init_result['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1
self.init_result['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
# 贷款期限
loan_term = self.get_loan_term()
# print(loan_term)
self.init_result['page_1']['贷款期限'] = loan_term
# 附加产品融资贷款本金总金额明细(ASP-表格)
asp_details_table = self.get_asp_details(page_num='0')
# print(asp_details_table)
self.init_result['page_1']['附加产品融资贷款本金总金额明细'] = asp_details_table
# 借款人签字及时间
signature = self.get_signature()
# print(signature)
self.init_result['page_1']['借款人签字及时间'] = signature
#######################################
# Page 2
# 找合同编号
contract_no = self.get_contract_no(page_num='0')
# print(contract_no)
self.init_result['page_2']['合同编号'] = contract_no
# 找借款人及抵押人(地址字段原本有空格)
borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人:')
# print(borrower_name, borrower_id)
# 这是为了同时兼容 8.1 版本
if borrower_name['words'] == None:
borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:')
self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name
self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id
# 找共同借款人及共同抵押人
co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人:', bottom='保证人1:')
# print(co_borrower_name, co_borrower_id)
self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name
self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id
# 保证人1
first_guarantor_name, first_guarantor_id = self.get_somebody(top='保证人1:', bottom='保证人2:')
self.init_result['page_2']['保证人1']['name'] = first_guarantor_name
self.init_result['page_2']['保证人1']['id'] = first_guarantor_id
# 保证人2
second_guarantor_name, second_guarantor_id = self.get_somebody(top='保证人2:', bottom='第一章')
self.init_result['page_2']['保证人2']['name'] = second_guarantor_name
self.init_result['page_2']['保证人2']['id'] = second_guarantor_id
# 所购车辆价格
vehicle_price = self.get_vehicle_price(page_num='1')
self.init_result['page_2']['所购车辆价格'] = vehicle_price
# 车架号
vin = self.get_vin(page_num='1')
self.init_result['page_2']['车架号'] = vin
# 经销商
seller = self.get_seller()
self.init_result['page_2']['经销商'] = seller
# 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目
upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1')
# print(upper, lower, asp_1, asp_2)
self.init_result['page_2']['贷款本金金额']['大写'] = upper
self.init_result['page_2']['贷款本金金额']['小写'] = lower
self.init_result['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1
self.init_result['page_2']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
# 贷款期限
loan_term = self.get_loan_term(page_num='1')
self.init_result['page_2']['贷款期限'] = loan_term
# 本合同当期的标准利率
standard_rate = self.get_standard_rate(page_num='1')
self.init_result['page_2']['标准利率'] = standard_rate
# 还款账户
account, account_name, account_bank = self.get_payback_account()
# print(account, account_name, account_bank)
self.init_result['page_2']['还款账户']['账号'] = account
self.init_result['page_2']['还款账户']['户名'] = account_name
self.init_result['page_2']['还款账户']['开户行'] = account_bank
#######################################
# Page 3
# 找合同编号
contract_no = self.get_contract_no(page_num='6')
self.init_result['page_7']['合同编号'] = contract_no
# Page 8
contract_no = self.get_contract_no(page_num='2')
self.init_result['page_3']['合同编号'] = contract_no
# 还款计划表(表格)
repayment_schedule_table = self.get_repayment_schedule()
# print(repayment_schedule_table)
self.init_result['page_3']['还款计划表'] = repayment_schedule_table
#######################################
# Page 4
# 找合同编号
contract_no = self.get_contract_no(page_num='7')
self.init_result['page_8']['合同编号'] = contract_no
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='合同编号', bottom='共同借款人')
self.init_result['page_8']['主借人签字']['签字'] = signature_name
self.init_result['page_8']['主借人签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='共同借款人', bottom='保证人1')
self.init_result['page_8']['共借人签字']['签字'] = signature_name
self.init_result['page_8']['共借人签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='保证人1', bottom='保证人2')
self.init_result['page_8']['保证人1签字']['签字'] = signature_name
self.init_result['page_8']['保证人1签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='保证人2', bottom='在本人面前亲笔签署本合同')
self.init_result['page_8']['保证人2签字']['签字'] = signature_name
self.init_result['page_8']['保证人2签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='在本人面前亲笔签署本合同', bottom='以下无正文')
self.init_result['page_8']['见证人签字']['签字'] = signature_name
self.init_result['page_8']['见证人签字']['日期'] = signature_date
contract_no = self.get_contract_no(page_num='3')
self.init_result['page_4']['合同编号'] = contract_no
# 附加产品融资贷款本金总金额明细(ASP-表格)
asp_details_table = self.get_asp_details(page_num='3')
# print(asp_details_table)
self.init_result['page_4']['附加产品融资贷款本金总金额明细'] = asp_details_table
#######################################
# Page 5
# 找合同编号
contract_no = self.get_contract_no(page_num='4')
self.init_result['page_5']['合同编号'] = contract_no
#######################################
# Page 6
# 找合同编号
contract_no = self.get_contract_no(page_num='5')
self.init_result['page_6']['合同编号'] = contract_no
if self.is_asp == False:
# Page 7
# 找合同编号
contract_no = self.get_contract_no(page_num='6')
self.init_result['page_7']['合同编号'] = contract_no
signature_name, signature_date = self.get_last_page_signature(page_num='6',
top='合同编号', bottom='共同借款人')
self.init_result['page_7']['主借人签字']['签字'] = signature_name
self.init_result['page_7']['主借人签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='6',
top='共同借款人', bottom='保证人1')
self.init_result['page_7']['共借人签字']['签字'] = signature_name
self.init_result['page_7']['共借人签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='6',
top='保证人1', bottom='保证人2')
self.init_result['page_7']['保证人1签字']['签字'] = signature_name
self.init_result['page_7']['保证人1签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='6',
top='保证人2', bottom='在本人面前亲笔签署本合同')
self.init_result['page_7']['保证人2签字']['签字'] = signature_name
self.init_result['page_7']['保证人2签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='6',
top='在本人面前亲笔签署本合同', bottom='以下无正文')
self.init_result['page_7']['见证人签字']['签字'] = signature_name
self.init_result['page_7']['见证人签字']['日期'] = signature_date
else:
# Page 7
# 找合同编号
contract_no = self.get_contract_no(page_num='6')
self.init_result['page_7']['合同编号'] = contract_no
# Page 8
# 找合同编号
contract_no = self.get_contract_no(page_num='7')
self.init_result['page_8']['合同编号'] = contract_no
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='合同编号', bottom='共同借款人')
self.init_result['page_8']['主借人签字']['签字'] = signature_name
self.init_result['page_8']['主借人签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='共同借款人', bottom='保证人1')
self.init_result['page_8']['共借人签字']['签字'] = signature_name
self.init_result['page_8']['共借人签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='保证人1', bottom='保证人2')
self.init_result['page_8']['保证人1签字']['签字'] = signature_name
self.init_result['page_8']['保证人1签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='保证人2', bottom='在本人面前亲笔签署本合同')
self.init_result['page_8']['保证人2签字']['签字'] = signature_name
self.init_result['page_8']['保证人2签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='在本人面前亲笔签署本合同', bottom='以下无正文')
self.init_result['page_8']['见证人签字']['签字'] = signature_name
self.init_result['page_8']['见证人签字']['日期'] = signature_date
# 重新定制输出
new_results = {"is_asp": self.is_asp,
"page_info": self.init_result
}
return new_results
return new_results
\ No newline at end of file
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!