fd556337 by 周伟奇

add contract 8.5

1 parent 2dc31fab
...@@ -2970,6 +2970,8 @@ def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True ...@@ -2970,6 +2970,8 @@ def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True
2970 auto_obj = auto_class.objects.filter(application_id=application_id, on_off=True).first() 2970 auto_obj = auto_class.objects.filter(application_id=application_id, on_off=True).first()
2971 if auto_obj is not None: 2971 if auto_obj is not None:
2972 auto_result = se_compare_auto(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, auto_obj) 2972 auto_result = se_compare_auto(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, auto_obj)
2973 else:
2974 auto_result = None
2973 2975
2974 full_result = se_compare(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, is_cms, auto_result) 2976 full_result = se_compare(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, is_cms, auto_result)
2975 2977
......
...@@ -6,28 +6,35 @@ ...@@ -6,28 +6,35 @@
6 # @Description : 6 # @Description :
7 7
8 from .get_char import Finder 8 from .get_char import Finder
9 import numpy as np
9 10
10 11
def predict(pdf_info):
    """Build per-page OCR-style results from parsed PDF text and run ``Finder``.

    For every page in ``pdf_info`` the text spans of blocks whose ``type`` is 0
    are collected as ``[polygon, text]`` pairs, sorted top-to-bottom and keyed
    by their position in that order (0, 1, 2, ...).

    Args:
        pdf_info: Mapping of page key -> page dict. Each page dict is read as
            ``page['blocks'][i]['lines'][j]['spans'][k]`` with ``bbox`` and
            ``text`` entries.
            NOTE(review): this looks like the PyMuPDF ``get_text('dict')``
            layout (type 0 == text block) -- confirm against the caller.

    Returns:
        Whatever ``Finder(pdf_info, ocr_results=...).get_info()`` returns.
    """
    ocr_results = {}
    for pno in pdf_info:
        ocr_result = []
        for block in pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    # NOTE(review): as rendered in this view both colon
                    # arguments are identical (a no-op); the repository
                    # presumably maps a full-width colon to ASCII -- confirm.
                    text = span['text'].replace(":", ":").replace(" ", "")
                    # Check emptiness AFTER normalisation so that spans made
                    # only of spaces do not become empty OCR entries.
                    if not text:
                        continue
                    xmin, ymin, xmax, ymax = span['bbox']
                    # Four corners, flattened: (xmin, ymin), (xmax, ymin),
                    # (xmax, ymax), (xmin, ymax); coordinates cast to int32.
                    polygon = np.array(
                        [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
                        dtype=np.int32,
                    ).tolist()
                    ocr_result.append([polygon, text])
        # Sort by y0 ascending (stable, so equal-y spans keep reading order).
        ocr_result.sort(key=lambda item: item[0][1])
        ocr_results[pno] = dict(enumerate(ocr_result))
    # Finder receives the information of the whole PDF.
    f = Finder(pdf_info, ocr_results=ocr_results)
    return f.get_info()
32 39
33 40
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
8 import re 8 import re
9 import numpy as np 9 import numpy as np
10 from fuzzywuzzy import fuzz 10 from fuzzywuzzy import fuzz
11 from shapely.geometry import Polygon
11 12
12 13
13 class Finder: 14 class Finder:
...@@ -111,6 +112,28 @@ class Finder: ...@@ -111,6 +112,28 @@ class Finder:
111 }, 112 },
112 } 113 }
113 114
def get_top_iou(self, poly, ocr_result):
    """Return the OCR entry whose polygon best overlaps ``poly`` (by IoU).

    Args:
        poly: Coordinates of the query polygon; reshaped to ``(-1, 2)`` points.
        ocr_result: Mapping of key -> ``[bbox, text]``; each ``bbox`` is
            reshaped to ``(-1, 2)`` points the same way.

    Returns:
        ``[iou, key]`` of the best-overlapping entry (on ties, the entry that
        comes last in iteration order), or ``(-1, -1)`` when ``ocr_result`` is
        empty, the query polygon is invalid, or no entry has a valid polygon.
        Both forms unpack as ``iou, key``.
    """
    if not ocr_result:
        return -1, -1
    # The query polygon does not depend on the loop; build and validate once.
    p = Polygon(np.array(poly).reshape((-1, 2)))
    if not p.is_valid:
        return -1, -1
    best = None
    for key in ocr_result:
        bbox, _text = ocr_result[key]
        g = Polygon(np.array(bbox).reshape((-1, 2)))
        if not g.is_valid:
            continue
        inter = g.intersection(p).area
        union = g.area + p.area - inter
        # Guard the degenerate zero-area case instead of dividing by zero.
        iou = inter / union if union > 0 else 0.0
        # ">=" keeps the last of equal scores, as a stable sort + [-1] would.
        if best is None or iou >= best[0]:
            best = [iou, key]
    if best is None:
        return -1, -1
    return best
136
114 def poly_to_rectangle(self, poly): 137 def poly_to_rectangle(self, poly):
115 xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax = poly 138 xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax = poly
116 bbox = [xmin, ymin, xmax, ymax] 139 bbox = [xmin, ymin, xmax, ymax]
...@@ -253,38 +276,67 @@ class Finder: ...@@ -253,38 +276,67 @@ class Finder:
253 if mergeindex == -1: 276 if mergeindex == -1:
254 return text_list 277 return text_list
255 else: 278 else:
256 new_text_list = text_list[:mergeindex] + [text_list[mergeindex] + text_list[mergeindex + 1]] + \ 279 new_text_list = text_list[:mergeindex] + [text_list[mergeindex] + text_list[mergeindex + 1]] + text_list[
257 text_list[mergeindex + 2:] 280 mergeindex + 2:]
258 return self.mergelist(new_text_list) 281 return self.mergelist(new_text_list)
259 282
def get_asp_details(self, page_num):
    """Extract the ASP (additional product) detail table of one page.

    The three column-header cells and the total label are located by exact
    text match in ``self.ocr_results[page_num]``. Starting from the first
    column header, the method repeatedly shifts the current cell's polygon
    down by 1.4 times its height and takes the best-overlapping OCR entry
    (via ``self.get_top_iou``) as the next cell of that column. For each such
    cell the two other columns are probed at the same vertical position.

    Args:
        page_num: Key of the page inside ``self.ocr_results``.

    Returns:
        A shallow copy of ``self.item`` whose ``'words'`` entry is the table:
        a title row, a header row, one ``[item, amount, principal]`` row per
        detected product and, if the total label was found, a total row.
    """
    page_ocr = self.ocr_results[page_num]
    asp_details_table_term = self.item.copy()
    asp_details_table = [['附加产品融资贷款本金总金额及贷款利率明细'], ['项目1', '用途总金额2', '贷款本金3']]

    def column_probe(column_bbox, row_bbox):
        # Even indices come from the column cell, odd indices from the row
        # cell. Presumably x/y of an 8-value flat polygon -- TODO confirm.
        return [column_bbox[0], row_bbox[1], column_bbox[2], row_bbox[3],
                column_bbox[4], row_bbox[5], column_bbox[6], row_bbox[7]]

    def best_match(poly):
        # Resolve the best-overlapping OCR entry; get_top_iou may hand back a
        # key that is not in the page (its "nothing found" result), which
        # must not be used as a dict index.
        iou, key = self.get_top_iou(poly=poly, ocr_result=page_ocr)
        if key not in page_ocr:
            return iou, None, ''
        found_bbox, found_text = page_ocr[key]
        return iou, found_bbox, found_text

    bbox = None
    bbox_xm = None      # header cell '项目1'
    bbox_ytzje = None   # header cell '用途总金额2'
    bbox_dkbj = None    # header cell '贷款本金3'
    bbox_total = None   # label cell '附加产品融资贷款本'
    for key in page_ocr:
        bbox, text = page_ocr[key]
        if text == '项目1':
            bbox_xm = bbox
        if text == '用途总金额2':
            bbox_ytzje = bbox
        if text == '贷款本金3':
            bbox_dkbj = bbox
        if text == '附加产品融资贷款本':
            bbox_total = bbox

    anchor_2 = None
    if bbox_xm:
        for _ in range(10):
            row_height = abs(bbox_xm[1] - bbox_xm[-1])
            anchor = np.array(bbox_xm).reshape((-1, 2))
            anchor[:, 1] += int(row_height * 1.4)
            iou, found_bbox, xm_text = best_match(anchor)
            if not iou > 0:
                break
            bbox = bbox_xm = found_bbox
            # Handle item names that wrap onto a second line: text without
            # '所购' is appended to the first cell of the previous row.
            if '所购' not in xm_text:
                asp_details_table[-1][0] += xm_text
                continue
            if bbox_ytzje is None or bbox_dkbj is None:
                # Without both column headers the row cannot be completed.
                break
            anchor_1 = column_probe(bbox_ytzje, bbox)
            _, found_bbox, ytzje_text = best_match(anchor_1)
            if found_bbox is not None:
                bbox = found_bbox
            anchor_2 = column_probe(bbox_dkbj, bbox)
            _, found_bbox, dkbj_text = best_match(anchor_2)
            if found_bbox is not None:
                bbox = found_bbox
            if xm_text == ytzje_text:
                # Both probes hit the same entry: split it at the first space
                # if there is one, otherwise keep the text in both cells
                # rather than failing on tuple unpacking.
                head, sep, tail = xm_text.partition(' ')
                if sep:
                    xm_text, ytzje_text = head, tail
            asp_details_table.append([xm_text, ytzje_text, dkbj_text])

    if bbox_total:
        # NOTE(review): the original computed a probe from bbox_total here but
        # then looked up with the last product row's `anchor_2`, which looks
        # like a copy-paste slip. Which probe is intended cannot be determined
        # from this diff, so existing results are preserved: `anchor_2` is
        # used when a product row was parsed, and the bbox_total probe is used
        # only where the original raised on the unbound name.
        if anchor_2 is not None:
            probe = anchor_2
        else:
            probe = column_probe(bbox_total, bbox)
        _, _, total_text = best_match(probe)
        asp_details_table.append(['附加产品融资贷款本金总金额:', '', total_text])
    asp_details_table_term['words'] = asp_details_table
    return asp_details_table_term
290 342
...@@ -678,38 +730,48 @@ class Finder: ...@@ -678,38 +730,48 @@ class Finder:
678 if '附加产品融资贷款本金总金额' in text: 730 if '附加产品融资贷款本金总金额' in text:
679 self.is_asp = True 731 self.is_asp = True
680 self.gen_init_result(self.is_asp) 732 self.gen_init_result(self.is_asp)
733 if len(list(self.ocr_results.keys())) <= 8: # 8.5 版本客户提供的样本出现串页的情况,暂时无法识别
681 # Page 1 734 # Page 1
682 # 找合同编号 735 # 找合同编号
683 contract_no = self.get_contract_no(page_num='0') 736 contract_no = self.get_contract_no(page_num='0')
737 # print(contract_no)
684 self.init_result['page_1']['合同编号'] = contract_no 738 self.init_result['page_1']['合同编号'] = contract_no
685 # 所购车辆价格 739 # 所购车辆价格
686 vehicle_price = self.get_vehicle_price() 740 vehicle_price = self.get_vehicle_price()
741 # print(vehicle_price)
687 self.init_result['page_1']['所购车辆价格'] = vehicle_price 742 self.init_result['page_1']['所购车辆价格'] = vehicle_price
688 # 车架号 743 # 车架号
689 vin = self.get_vin() 744 vin = self.get_vin()
745 # print(vin)
690 self.init_result['page_1']['车架号'] = vin 746 self.init_result['page_1']['车架号'] = vin
691 # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目 747 # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目
692 upper, lower, asp_1, asp_2 = self.get_loan_principal() 748 upper, lower, asp_1, asp_2 = self.get_loan_principal()
749 # print(upper, lower, asp_1, asp_2)
693 self.init_result['page_1']['贷款本金金额']['大写'] = upper 750 self.init_result['page_1']['贷款本金金额']['大写'] = upper
694 self.init_result['page_1']['贷款本金金额']['小写'] = lower 751 self.init_result['page_1']['贷款本金金额']['小写'] = lower
695 self.init_result['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1 752 self.init_result['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1
696 self.init_result['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2 753 self.init_result['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
697 # 贷款期限 754 # 贷款期限
698 loan_term = self.get_loan_term() 755 loan_term = self.get_loan_term()
756 # print(loan_term)
699 self.init_result['page_1']['贷款期限'] = loan_term 757 self.init_result['page_1']['贷款期限'] = loan_term
700 # 附加产品融资贷款本金总金额明细(ASP-表格) 758 # 附加产品融资贷款本金总金额明细(ASP-表格)
701 asp_details_table = self.get_asp_details(page_num='0') 759 asp_details_table = self.get_asp_details(page_num='0')
760 # print(asp_details_table)
702 self.init_result['page_1']['附加产品融资贷款本金总金额明细'] = asp_details_table 761 self.init_result['page_1']['附加产品融资贷款本金总金额明细'] = asp_details_table
703 # 借款人签字及时间 762 # 借款人签字及时间
704 signature = self.get_signature() 763 signature = self.get_signature()
764 # print(signature)
705 self.init_result['page_1']['借款人签字及时间'] = signature 765 self.init_result['page_1']['借款人签字及时间'] = signature
706 ####################################### 766 #######################################
707 # Page 2 767 # Page 2
708 # 找合同编号 768 # 找合同编号
709 contract_no = self.get_contract_no(page_num='0') 769 contract_no = self.get_contract_no(page_num='0')
770 # print(contract_no)
710 self.init_result['page_2']['合同编号'] = contract_no 771 self.init_result['page_2']['合同编号'] = contract_no
711 # 找借款人及抵押人(地址字段原本有空格) 772 # 找借款人及抵押人(地址字段原本有空格)
712 borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人:') 773 borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人:')
774 # print(borrower_name, borrower_id)
713 # 这是为了同时兼容 8.1 版本 775 # 这是为了同时兼容 8.1 版本
714 if borrower_name['words'] == None: 776 if borrower_name['words'] == None:
715 borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:') 777 borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:')
...@@ -717,6 +779,7 @@ class Finder: ...@@ -717,6 +779,7 @@ class Finder:
717 self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id 779 self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id
718 # 找共同借款人及共同抵押人 780 # 找共同借款人及共同抵押人
719 co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人:', bottom='保证人1:') 781 co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人:', bottom='保证人1:')
782 # print(co_borrower_name, co_borrower_id)
720 self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name 783 self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name
721 self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id 784 self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id
722 # 保证人1 785 # 保证人1
...@@ -738,6 +801,7 @@ class Finder: ...@@ -738,6 +801,7 @@ class Finder:
738 self.init_result['page_2']['经销商'] = seller 801 self.init_result['page_2']['经销商'] = seller
739 # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目 802 # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目
740 upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1') 803 upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1')
804 # print(upper, lower, asp_1, asp_2)
741 self.init_result['page_2']['贷款本金金额']['大写'] = upper 805 self.init_result['page_2']['贷款本金金额']['大写'] = upper
742 self.init_result['page_2']['贷款本金金额']['小写'] = lower 806 self.init_result['page_2']['贷款本金金额']['小写'] = lower
743 self.init_result['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1 807 self.init_result['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1
...@@ -750,6 +814,7 @@ class Finder: ...@@ -750,6 +814,7 @@ class Finder:
750 self.init_result['page_2']['标准利率'] = standard_rate 814 self.init_result['page_2']['标准利率'] = standard_rate
751 # 还款账户 815 # 还款账户
752 account, account_name, account_bank = self.get_payback_account() 816 account, account_name, account_bank = self.get_payback_account()
817 # print(account, account_name, account_bank)
753 self.init_result['page_2']['还款账户']['账号'] = account 818 self.init_result['page_2']['还款账户']['账号'] = account
754 self.init_result['page_2']['还款账户']['户名'] = account_name 819 self.init_result['page_2']['还款账户']['户名'] = account_name
755 self.init_result['page_2']['还款账户']['开户行'] = account_bank 820 self.init_result['page_2']['还款账户']['开户行'] = account_bank
...@@ -760,6 +825,7 @@ class Finder: ...@@ -760,6 +825,7 @@ class Finder:
760 self.init_result['page_3']['合同编号'] = contract_no 825 self.init_result['page_3']['合同编号'] = contract_no
761 # 还款计划表(表格) 826 # 还款计划表(表格)
762 repayment_schedule_table = self.get_repayment_schedule() 827 repayment_schedule_table = self.get_repayment_schedule()
828 # print(repayment_schedule_table)
763 self.init_result['page_3']['还款计划表'] = repayment_schedule_table 829 self.init_result['page_3']['还款计划表'] = repayment_schedule_table
764 ####################################### 830 #######################################
765 # Page 4 831 # Page 4
...@@ -768,6 +834,7 @@ class Finder: ...@@ -768,6 +834,7 @@ class Finder:
768 self.init_result['page_4']['合同编号'] = contract_no 834 self.init_result['page_4']['合同编号'] = contract_no
769 # 附加产品融资贷款本金总金额明细(ASP-表格) 835 # 附加产品融资贷款本金总金额明细(ASP-表格)
770 asp_details_table = self.get_asp_details(page_num='3') 836 asp_details_table = self.get_asp_details(page_num='3')
837 # print(asp_details_table)
771 self.init_result['page_4']['附加产品融资贷款本金总金额明细'] = asp_details_table 838 self.init_result['page_4']['附加产品融资贷款本金总金额明细'] = asp_details_table
772 ####################################### 839 #######################################
773 # Page 5 840 # Page 5
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!