87525e99 by 周伟奇

fix afc e-contract

1 parent 9bab1769
......@@ -9,8 +9,23 @@ from .get_char import Finder
def predict(pdf_info):
    """Extract the contract fields from a parsed PDF.

    Every text span of every page is flattened into ``ocr_results`` as a
    ``[polygon, text]`` pair, and both structures are handed to ``Finder``.

    Args:
        pdf_info: Mapping of page key -> page dict. Each page dict has a
            ``'blocks'`` list; blocks whose ``'type'`` is not 0 are skipped,
            the others are read through ``'lines'`` -> ``'spans'``, each
            span providing ``'bbox'`` and ``'text'``.

    Returns:
        The value returned by ``Finder.get_info()``.
    """
    ocr_results = {}
    for pno in pdf_info:
        page_results = {}
        for block in pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    xmin, ymin, xmax, ymax = span['bbox']
                    # Axis-aligned box expressed as its four corners,
                    # clockwise from the top-left.
                    polygon = [xmin, ymin, xmax, ymin,
                               xmax, ymax, xmin, ymax]
                    # Normalise the full-width colon (U+FF1A) to ASCII so the
                    # ':'-based keyword lookups match, then drop spaces.
                    # NOTE(review): the colon literal was an ASCII no-op in
                    # the reviewed text; full-width is the presumed intent.
                    text = span['text'].replace("\uff1a", ":").replace(" ", "")
                    # Key by a running per-page counter: keying by block
                    # index made later spans of a block overwrite earlier
                    # ones, so only the last span of each block survived.
                    page_results[len(page_results)] = [polygon, text]
        ocr_results[pno] = page_results
    # The finder receives the whole PDF description plus the flattened spans.
    f = Finder(pdf_info, ocr_results=ocr_results)
    results = f.get_info()
    return results
......
......@@ -11,14 +11,13 @@ from fuzzywuzzy import fuzz
class Finder:
def __init__(self, pdf_info):
def __init__(self, pdf_info, ocr_results):
self.pdf_info = pdf_info
self.ocr_results = ocr_results
self.is_asp = False
self.item = {"words": None,
"position": None,
}
def gen_init_result(self, is_asp):
# 格式化算法输出
self.init_result = {"page_1": {"合同编号": self.item,
......@@ -109,8 +108,10 @@ class Finder:
"日期": self.item,
},
}
def poly_to_rectangle(self, poly):
    """Return the bounding box ``[xmin, ymin, xmax, ymax]`` of a quadrilateral.

    Args:
        poly: Eight numbers ``[x0, y0, x1, y1, x2, y2, x3, y3]``.

    Returns:
        list: ``[xmin, ymin, xmax, ymax]`` enclosing all four corners.

    Raises:
        ValueError: If ``poly`` does not hold exactly eight values.
    """
    x0, y0, x1, y1, x2, y2, x3, y3 = poly
    # min/max instead of picking fixed corners: identical for axis-aligned
    # rectangles, and still a correct box when the quadrilateral is skewed.
    xs = (x0, x1, x2, x3)
    ys = (y0, y1, y2, y3)
    return [min(xs), min(ys), max(xs), max(ys)]
def get_contract_no(self, page_num):
"""传入页码,查看该页码右上角的编号
......@@ -121,47 +122,41 @@ class Finder:
sting:
"""
contract_no = self.item.copy()
# contract_no['words'] = ''
# contract_no['position'] = [-1, -1, -1, -1]
# 只看第一页
for block in self.pdf_info[page_num]['blocks']:
if block['type'] != 0:
continue
for line in block['lines']:
for span in line['spans']:
bbox, text = span['bbox'], span['text']
if '合同编号:' in text:
words = text.split(':')[-1]
contract_no['position'] = bbox
contract_no['words'] = words
for key in self.ocr_results[page_num]:
bbox, text = self.ocr_results[page_num][key]
if '合同编号:' in text:
words = text.split(':')[-1]
location = self.poly_to_rectangle(bbox)
contract_no['words'] = words
contract_no['position'] = location
return contract_no
def get_vehicle_price(self, page_num='0'):
vehicle_price = self.item.copy()
for block in self.pdf_info[page_num]['blocks']:
if block['type'] != 0:
continue
for line in block['lines']:
for span in line['spans']:
bbox, text = span['bbox'], span['text']
if '所购车辆价格为人民币' in text:
words = text.split('币')[-1]
vehicle_price['position'] = bbox
vehicle_price['words'] = words
# vehicle_price['words'] = ''
# vehicle_price['position'] = [-1, -1, -1, -1]
for key in self.ocr_results[page_num]:
bbox, text = self.ocr_results[page_num][key]
if '所购车辆价格为人民币' in text:
words = text.split('币')[-1]
location = self.poly_to_rectangle(bbox)
vehicle_price['words'] = words
vehicle_price['position'] = location
return vehicle_price
def get_vin(self, page_num='0'):
vin = self.item.copy()
for block in self.pdf_info[page_num]['blocks']:
if block['type'] != 0:
continue
for line in block['lines']:
for span in line['spans']:
bbox, text = span['bbox'], span['text']
if '车架号:' in text:
words = text.split(':')[-1]
vin['position'] = bbox
vin['words'] = words
# vin['words'] = ''
# vin['position'] = [-1, -1, -1, -1]
for key in self.ocr_results[page_num]:
bbox, text = self.ocr_results[page_num][key]
if '车架号:' in text:
words = text.split(':')[-1]
location = self.poly_to_rectangle(bbox)
vin['words'] = words
vin['position'] = location
return vin
def get_loan_principal(self, page_num='0'):
chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
'佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
......@@ -202,7 +197,6 @@ class Finder:
asp_2['position'] = bbox
asp_2['words'] = words
return upper, lower, asp_1, asp_2
def get_loan_term(self, page_num='0'):
loan_term = self.item.copy()
all_text = ''
......@@ -226,10 +220,20 @@ class Finder:
loan_term['position'] = bbox
loan_term['words'] = words
return loan_term
def mergelist(self, text_list):
    """Glue wrapped '所购…' cells back together.

    An element that contains '所购' absorbs each following element for as
    long as that following element contains at least one Chinese character
    (U+4E00–U+9FA5). Elements without Chinese characters end the run.

    Args:
        text_list: List of strings in reading order.

    Returns:
        list: A new list with the wrapped pieces concatenated. An element
        containing '所购' at the very end of the list is kept as is
        (previously this raised ``IndexError``).
    """
    non_chinese = re.compile("[^\u4e00-\u9fa5]")  # everything that is not Chinese
    merged = []
    for text in text_list:
        # Stripping the non-Chinese characters leaves a non-empty string
        # exactly when `text` contains a Chinese character.
        if merged and '所购' in merged[-1] and non_chinese.sub('', text):
            merged[-1] += text
        else:
            merged.append(text)
    return merged
def get_asp_details(self, page_num):
asp_details_table_term = self.item.copy()
asp_details_table = []
asp_details_text_list = []
table = False
......@@ -244,26 +248,22 @@ class Finder:
if '第二条' in text or '征信管理' in text:
table = False
if table == True:
# print(text)
asp_details_text_list.append(text)
asp_details_text_list = self.mergelist(asp_details_text_list)
for i in range((len(asp_details_text_list)+2)//3):
line = []
if i == 0:
line = [asp_details_text_list[0]]
else:
for j in range(3):
line.append(asp_details_text_list[i*3-2+j])
asp_details_table.append(line)
if len(asp_details_table) > 0:
asp_details_table_term['words'] = asp_details_table
return asp_details_table_term
def get_signature(self):
signature = self.item.copy()
for block in self.pdf_info['0']['blocks']:
if block['type'] != 0:
continue
......@@ -275,7 +275,6 @@ class Finder:
signature['words'] = words
signature['position'] = bbox
return signature
def get_somebody(self, top, bottom):
# 指定上下边界后,返回上下边界内的客户信息
_name = self.item.copy()
......@@ -300,6 +299,7 @@ class Finder:
for span in line['spans']:
bbox, text = span['bbox'], span['text']
if y_top < bbox[3] < y_bottom:
# print(top, bottom, text)
if '姓名/名称' in text:
words = text.split(':')[-1]
_name['position'] = bbox
......@@ -309,7 +309,6 @@ class Finder:
_id['position'] = bbox
_id['words'] = words
return _name, _id
def get_seller(self):
seller = self.item.copy()
# 先找到 key
......@@ -336,7 +335,6 @@ class Finder:
seller['position'] = bbox
seller['words'] = text
return seller
def get_payback_account(self):
account = self.item.copy()
account_name = self.item.copy()
......@@ -389,7 +387,6 @@ class Finder:
account_bank['position'] = bbox
account_bank['words'] = words
return account, account_name, account_bank
def get_repayment_schedule(self):
repayment_schedule = self.item.copy()
# 只看第二页
......@@ -408,23 +405,17 @@ class Finder:
table = False
if table == True:
repayment_schedule_text_list.append(text)
for i in range(len(repayment_schedule_text_list)//5):
line = []
# 5表示5列的意思
for j in range(5):
line.append(repayment_schedule_text_list[i*5+j])
if str(i+1) == line[1]:
break
repayment_schedule_table.append(line)
if len(repayment_schedule_table) > 0:
repayment_schedule['words'] = repayment_schedule_table
return repayment_schedule
def get_signature_role_1(self):
signature_role_1 = self.init_item.copy()
# 先定位签字区域
......@@ -459,7 +450,6 @@ class Finder:
signature_role_1['position'] = position
signature_role_1['words'] = words
return signature_role_1
def get_signature_role_2(self):
signature_role_2 = self.init_item.copy()
# 先定位签字区域
......@@ -494,7 +484,6 @@ class Finder:
signature_role_2['position'] = position
signature_role_2['words'] = words
return signature_role_2
def get_signature_role_3(self):
signature_role_3 = self.init_item.copy()
# 先定位签字区域
......@@ -529,7 +518,6 @@ class Finder:
signature_role_3['position'] = position
signature_role_3['words'] = words
return signature_role_3
def get_signature_role_4(self):
signature_role_4 = self.init_item.copy()
# 先定位签字区域
......@@ -564,7 +552,6 @@ class Finder:
signature_role_4['position'] = position
signature_role_4['words'] = words
return signature_role_4
def get_signature_role_5(self):
signature_role_5 = self.init_item.copy()
# 先定位签字区域
......@@ -600,7 +587,6 @@ class Finder:
signature_role_5['position'] = position
signature_role_5['words'] = words
return signature_role_5
def get_last_page_signature(self, page_num, top, bottom):
signature_name = self.item.copy()
signature_date = self.item.copy()
......@@ -616,6 +602,7 @@ class Finder:
anchor_top = bbox[1]
if bottom in text:
anchor_bottom = bbox[1]
# print(top, anchor_top, anchor_bottom)
if anchor_top is not None and anchor_bottom is not None:
for block in self.pdf_info[page_num]['blocks']:
if block['type'] != 0:
......@@ -629,9 +616,8 @@ class Finder:
signature_name['words'] = name
signature_name['position'] = bbox
signature_date['words'] = date
signature_name['position'] = bbox
signature_date['position'] = bbox
return signature_name, signature_date
def get_info(self):
"""
block['type'] == 0 : 表示该元素为图片
......@@ -639,21 +625,22 @@ class Finder:
Returns:
dict: Description
"""
# 先判断是否为 ASP 产品
# 只看第一页,判断是否有 '附加产品融资贷款本金总金额' 这一句话,若有则为 ASP 产品
# print(self.pdf_info['0']['blocks'])
for block in self.pdf_info['0']['blocks']:
if block['type'] != 0:
continue
for line in block['lines']:
for span in line['spans']:
bbox, text = span['bbox'], span['text']
if '附加产品融资贷款本金总金额' == text:
self.is_asp = True
# for block in self.pdf_info['0']['blocks']:
# if block['type'] != 0:
# continue
# for line in block['lines']:
# for span in line['spans']:
# bbox, text = span['bbox'], span['text']
# if '附加产品融资贷款本金总金额' == text:
# self.is_asp = True
for key in self.ocr_results['0']:
bbox, text = self.ocr_results['0'][key]
if '附加产品融资贷款本金总金额' in text:
self.is_asp = True
self.gen_init_result(self.is_asp)
# Page 1
# 找合同编号
contract_no = self.get_contract_no(page_num='0')
......@@ -663,7 +650,7 @@ class Finder:
self.init_result['page_1']['所购车辆价格'] = vehicle_price
# 车架号
vin = self.get_vin()
self.init_result['page_1']['车架号'] = vehicle_price
self.init_result['page_1']['车架号'] = vin
# 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目
upper, lower, asp_1, asp_2 = self.get_loan_principal()
self.init_result['page_1']['贷款本金金额']['大写'] = upper
......@@ -685,11 +672,14 @@ class Finder:
contract_no = self.get_contract_no(page_num='0')
self.init_result['page_2']['合同编号'] = contract_no
# 找借款人及抵押人(地址字段原本有空格)
borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:')
borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人:')
# 这是为了同时兼容 8.1 版本
if borrower_name['words'] == None:
borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:')
self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name
self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id
# 找共同借款人及共同抵押人
co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人及共同抵押人:', bottom='保证人1:')
co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人:', bottom='保证人1:')
self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name
self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id
# 保证人1
......@@ -755,11 +745,11 @@ class Finder:
contract_no = self.get_contract_no(page_num='6')
self.init_result['page_7']['合同编号'] = contract_no
signature_name, signature_date = self.get_last_page_signature(page_num='6',
top='借款人(抵押人)', bottom='共同借款人(共同抵押人)')
top='合同编号', bottom='共同借款人')
self.init_result['page_7']['主借人签字']['签字'] = signature_name
self.init_result['page_7']['主借人签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='6',
top='共同借款人(共同抵押人)', bottom='保证人1')
top='共同借款人', bottom='保证人1')
self.init_result['page_7']['共借人签字']['签字'] = signature_name
self.init_result['page_7']['共借人签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='6',
......@@ -771,7 +761,7 @@ class Finder:
self.init_result['page_7']['保证人2签字']['签字'] = signature_name
self.init_result['page_7']['保证人2签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='6',
top='在本人面前亲笔签署本合同', bottom='(以下无正文)')
top='在本人面前亲笔签署本合同', bottom='以下无正文')
self.init_result['page_7']['见证人签字']['签字'] = signature_name
self.init_result['page_7']['见证人签字']['日期'] = signature_date
else:
......@@ -784,11 +774,11 @@ class Finder:
contract_no = self.get_contract_no(page_num='7')
self.init_result['page_8']['合同编号'] = contract_no
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='借款人(抵押人)', bottom='共同借款人(共同抵押人)')
top='合同编号', bottom='共同借款人')
self.init_result['page_8']['主借人签字']['签字'] = signature_name
self.init_result['page_8']['主借人签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='共同借款人(共同抵押人)', bottom='保证人1')
top='共同借款人', bottom='保证人1')
self.init_result['page_8']['共借人签字']['签字'] = signature_name
self.init_result['page_8']['共借人签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='7',
......@@ -800,10 +790,9 @@ class Finder:
self.init_result['page_8']['保证人2签字']['签字'] = signature_name
self.init_result['page_8']['保证人2签字']['日期'] = signature_date
signature_name, signature_date = self.get_last_page_signature(page_num='7',
top='在本人面前亲笔签署本合同', bottom='(以下无正文)')
top='在本人面前亲笔签署本合同', bottom='以下无正文')
self.init_result['page_8']['见证人签字']['签字'] = signature_name
self.init_result['page_8']['见证人签字']['日期'] = signature_date
# 重新定制输出
new_results = {"is_asp": self.is_asp,
"page_info": self.init_result
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!